Commit

Merge pull request #405 from NannyML/mc_auroc
Update AUROC checks for CBPE and realized performance
nnansters authored Jul 18, 2024
2 parents 7b6db6f + d353276 commit a121838
Showing 6 changed files with 183 additions and 62 deletions.
@@ -721,7 +721,6 @@ def __init__(
Name(s) of the column(s) containing your model output. For binary classification, pass a single string
refering to the model output column.
"""

if normalize_business_value not in [None, "per_prediction"]:
raise InvalidArgumentsException(
f"normalize_business_value must be None or 'per_prediction', but got {normalize_business_value}"
@@ -863,6 +862,7 @@ def __init__(
self._sampling_error_components: Tuple = ()

def __str__(self):
"""Get string representation of metric."""
return "confusion_matrix"

def fit(self, reference_data: pd.DataFrame, chunker: Chunker):
@@ -19,7 +19,7 @@
)
from sklearn.preprocessing import LabelBinarizer, label_binarize

from nannyml._typing import ProblemType, class_labels, model_output_column_names
from nannyml._typing import ProblemType, class_labels
from nannyml.base import _list_missing, common_nan_removal
from nannyml.chunk import Chunker
from nannyml.exceptions import InvalidArgumentsException
@@ -84,29 +84,42 @@ def __init__(
upper_threshold_limit=1,
components=[("ROC AUC", "roc_auc")],
)
# FIXME: Should we check the y_pred_proba argument here to ensure it's a dict?
self.y_pred_proba: Dict[str, str]

# sampling error
self.classes: List[str] = [""]
self.class_probability_columns: List[str]
self._sampling_error_components: List[Tuple] = []

def __str__(self):
"""Get string representation of metric."""
return "roc_auc"

def _fit(self, reference_data: pd.DataFrame):
classes = class_labels(self.y_pred_proba)
class_y_pred_proba_columns = model_output_column_names(self.y_pred_proba)
_list_missing([self.y_true] + class_y_pred_proba_columns, list(reference_data.columns))
# set up sorted classes and prob_column_names to use across metric class
self.classes = class_labels(self.y_pred_proba)
self.class_probability_columns = [self.y_pred_proba[clazz] for clazz in self.classes]

_list_missing([self.y_true] + self.class_probability_columns, list(reference_data.columns))
reference_data, empty = common_nan_removal(
reference_data[[self.y_true] + class_y_pred_proba_columns], [self.y_true] + class_y_pred_proba_columns
reference_data[[self.y_true] + self.class_probability_columns],
[self.y_true] + self.class_probability_columns
)
if empty:
self._sampling_error_components = [(np.NaN, 0) for class_col in class_y_pred_proba_columns]
self._sampling_error_components = [(np.NaN, 0) for clasz in self.classes]
# TODO: Ideally we would also raise an error here!
else:
# test if reference data are represented correctly
observed_classes = set(reference_data[self.y_true].unique())
if not observed_classes == set(self.classes):
self._logger.error(
"The specified classification classes are not the same as the classes observed in the reference"
"targets."
)
raise InvalidArgumentsException(
"y_pred_proba class and class probabilities dictionary does not match reference data.")

# sampling error
binarized_y_true = list(label_binarize(reference_data[self.y_true], classes=classes).T)
y_pred_proba = [reference_data[self.y_pred_proba[clazz]].T for clazz in classes]
binarized_y_true = list(label_binarize(reference_data[self.y_true], classes=self.classes).T)
y_pred_proba = [reference_data[self.y_pred_proba[clazz]].T for clazz in self.classes]
self._sampling_error_components = auroc_sampling_error_components(
y_true_reference=binarized_y_true, y_pred_proba_reference=y_pred_proba
)
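
The reference-data check introduced in _fit above reduces to comparing the classes observed in the reference targets with the keys of the y_pred_proba mapping. A minimal standalone sketch of that comparison, using illustrative column names that are not taken from this diff:

import pandas as pd

from nannyml.exceptions import InvalidArgumentsException


def check_reference_classes(reference_data: pd.DataFrame, y_true: str, y_pred_proba: dict) -> None:
    # Mirror of the new check: the reference targets must contain exactly the configured classes.
    observed_classes = set(reference_data[y_true].unique())
    if observed_classes != set(y_pred_proba.keys()):
        raise InvalidArgumentsException(
            "y_pred_proba class and class probabilities dictionary does not match reference data."
        )


reference = pd.DataFrame({'y_true': ['prepaid_card', 'upmarket_card', 'highstreet_card']})
mapping = {
    'prepaid_card': 'y_pred_proba_prepaid_card',
    'highstreet_card': 'y_pred_proba_highstreet_card',
    'upmarket_card': 'y_pred_proba_upmarket_card',
    'clazz': 'y_pred_proba_clazz',
}
check_reference_classes(reference, 'y_true', mapping)  # raises: 'clazz' never occurs in the reference targets

On reference data the mismatch is treated as an error; per analysis chunk the calculator is more lenient and, as the _calculate hunk below shows, only warns and returns NaN.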
@@ -119,37 +132,34 @@ def _calculate(self, data: pd.DataFrame):
"be a dictionary mapping classes to columns."
)

class_y_pred_proba_columns = model_output_column_names(self.y_pred_proba)
_list_missing([self.y_true] + class_y_pred_proba_columns, data)
_list_missing([self.y_true] + self.class_probability_columns, data)
data, empty = common_nan_removal(
data[[self.y_true] + class_y_pred_proba_columns], [self.y_true] + class_y_pred_proba_columns
data[[self.y_true] + self.class_probability_columns], [self.y_true] + self.class_probability_columns
)
if empty:
warnings.warn(f"Too many missing values, cannot calculate {self.display_name}. " f"Returning NaN.")
_message = f"Too many missing values, cannot calculate {self.display_name}. " f"Returning NaN."
self._logger.warning(_message)
warnings.warn(_message)
return np.NaN

labels, class_probability_columns = [], []
for label in sorted(list(self.y_pred_proba.keys())):
labels.append(label)
class_probability_columns.append(self.y_pred_proba[label])

y_true = data[self.y_true]
y_pred_proba = data[class_probability_columns]
y_pred_proba = data[self.class_probability_columns]

if y_true.nunique() <= 1:
warnings.warn(
f"'{self.y_true}' only contains a single class for chunk, cannot calculate {self.display_name}. "
if set(y_true.unique()) != set(self.classes):
_message = (
f"'{self.y_true}' does not contain all reported classes, cannot calculate {self.display_name}. "
"Returning NaN."
)
warnings.warn(_message)
self._logger.warning(_message)
return np.NaN
else:
return roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro', labels=labels)
return roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro', labels=self.classes)
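
The roc_auc_score call above only works when every class listed in labels occurs in the chunk's targets, because the one-vs-rest AUROC of an absent class is undefined; that is what the new guard returning NaN protects against. A small sketch of the same call on toy data (class names and probabilities here are assumptions, not taken from this diff):

import pandas as pd
from sklearn.metrics import roc_auc_score

classes = ['highstreet_card', 'prepaid_card', 'upmarket_card']
y_true = pd.Series(['prepaid_card', 'upmarket_card', 'highstreet_card', 'prepaid_card'])
# Per-class probability columns, ordered consistently with `classes`.
y_pred_proba = pd.DataFrame(
    [[0.1, 0.7, 0.2],
     [0.2, 0.2, 0.6],
     [0.6, 0.2, 0.2],
     [0.2, 0.5, 0.3]],
    columns=classes,
)
print(roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro', labels=classes))
# Dropping all 'upmarket_card' rows from this chunk would make the call fail,
# which is the situation the NaN guard above handles.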

def _sampling_error(self, data: pd.DataFrame) -> float:
class_y_pred_proba_columns = model_output_column_names(self.y_pred_proba)
_list_missing([self.y_true] + class_y_pred_proba_columns, data)
_list_missing([self.y_true] + self.class_probability_columns, data)
data, empty = common_nan_removal(
data[[self.y_true] + class_y_pred_proba_columns], [self.y_true] + class_y_pred_proba_columns
data[[self.y_true] + self.class_probability_columns], [self.y_true] + self.class_probability_columns
)
if empty:
warnings.warn(
7 changes: 4 additions & 3 deletions nannyml/performance_estimation/confidence_based/cbpe.py
@@ -541,11 +541,12 @@ def _fit_calibrators(
noop_calibrator = NoopCalibrator()

for clazz, y_true, y_pred_proba in _get_class_splits(reference_data, y_true_col, y_pred_proba_col):
_calibrator = copy.deepcopy(calibrator)
if not needs_calibration(np.asarray(y_true), np.asarray(y_pred_proba), calibrator):
calibrator = noop_calibrator
_calibrator = noop_calibrator

calibrator.fit(y_pred_proba, y_true)
fitted_calibrators[clazz] = copy.deepcopy(calibrator)
_calibrator.fit(y_pred_proba, y_true)
fitted_calibrators[clazz] = copy.deepcopy(_calibrator)

return fitted_calibrators
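
The _fit_calibrators change above fixes a loop bug: once needs_calibration returned False for one class, the shared calibrator name stayed bound to the no-op calibrator, so every later class was also fitted without real calibration. A toy, self-contained illustration of the pre-fix and post-fix patterns (the Calibrator class below is a stand-in, not NannyML's calibrator API):

import copy


class Calibrator:  # stand-in object for illustration only
    def __init__(self, name):
        self.name = name


real, noop = Calibrator('isotonic'), Calibrator('noop')
splits = [('prepaid_card', False), ('upmarket_card', True)]  # (class, needs calibration?)

# Pre-fix pattern: the rebinding sticks, so 'upmarket_card' silently loses calibration.
calibrator, buggy = real, {}
for clazz, needs in splits:
    if not needs:
        calibrator = noop
    buggy[clazz] = copy.deepcopy(calibrator)

# Post-fix pattern: each class works on its own copy of the configured calibrator.
fixed = {}
for clazz, needs in splits:
    _calibrator = copy.deepcopy(real)
    if not needs:
        _calibrator = noop
    fixed[clazz] = copy.deepcopy(_calibrator)

print(buggy['upmarket_card'].name, fixed['upmarket_card'].name)  # noop isotonic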

58 changes: 33 additions & 25 deletions nannyml/performance_estimation/confidence_based/metrics.py
@@ -2327,36 +2327,43 @@ def __init__(
threshold=threshold,
components=[('ROC AUC', 'roc_auc')],
)
# FIXME: Should we check the y_pred_proba argument here to ensure it's a dict?
self.y_pred_proba: Dict[str, str]

# sampling error
self.classes: List[str] = [""]
self.class_probability_columns: List[str]
self.class_uncalibrated_y_pred_proba_columns: List[str]
self._sampling_error_components: List[Tuple] = []

def _fit(self, reference_data: pd.DataFrame):
classes = class_labels(self.y_pred_proba)
class_y_pred_proba_columns = model_output_column_names(self.y_pred_proba)
class_uncalibrated_y_pred_proba_columns = ['uncalibrated_' + el for el in class_y_pred_proba_columns]
_list_missing([self.y_true] + class_uncalibrated_y_pred_proba_columns, list(reference_data.columns))
self.classes = class_labels(self.y_pred_proba)
self.class_probability_columns = [self.y_pred_proba[clazz] for clazz in self.classes]
self.class_uncalibrated_y_pred_proba_columns = ['uncalibrated_' + el for el in self.class_probability_columns]
_list_missing([self.y_true] + self.class_uncalibrated_y_pred_proba_columns, list(reference_data.columns))
# filter nans here
reference_data, empty = common_nan_removal(
reference_data[[self.y_true] + class_uncalibrated_y_pred_proba_columns],
[self.y_true] + class_uncalibrated_y_pred_proba_columns,
reference_data[[self.y_true] + self.class_uncalibrated_y_pred_proba_columns],
[self.y_true] + self.class_uncalibrated_y_pred_proba_columns,
)
if empty:
self._sampling_error_components = [(np.NaN, 0) for class_col in class_y_pred_proba_columns]
self._sampling_error_components = [(np.NaN, 0) for clasz in self.classes]
else:
# test if reference data are represented correctly
observed_classes = set(reference_data[self.y_true].unique())
if not observed_classes == set(self.classes):
self._logger.error(
"The specified classification classes are not the same as the classes observed in the reference"
"targets."
)
raise InvalidArgumentsException(
"y_pred_proba class and class probabilities dictionary does not match reference data.")
# sampling error
binarized_y_true = list(label_binarize(reference_data[self.y_true], classes=classes).T)
y_pred_proba = [reference_data['uncalibrated_' + self.y_pred_proba[clazz]].T for clazz in classes]
binarized_y_true = list(label_binarize(reference_data[self.y_true], classes=self.classes).T)
y_pred_proba = [reference_data['uncalibrated_' + self.y_pred_proba[clazz]].T for clazz in self.classes]
self._sampling_error_components = mse.auroc_sampling_error_components(
y_true_reference=binarized_y_true, y_pred_proba_reference=y_pred_proba
)

def _estimate(self, data: pd.DataFrame):
class_y_pred_proba_columns = model_output_column_names(self.y_pred_proba)
class_uncalibrated_y_pred_proba_columns = ['uncalibrated_' + el for el in class_y_pred_proba_columns]
needed_columns = class_y_pred_proba_columns + class_uncalibrated_y_pred_proba_columns
needed_columns = self.class_probability_columns + self.class_uncalibrated_y_pred_proba_columns
try:
_list_missing(needed_columns, list(data.columns))
except InvalidArgumentsException as ex:
@@ -2390,9 +2397,7 @@ def _estimate(self, data: pd.DataFrame):
return multiclass_roc_auc

def _sampling_error(self, data: pd.DataFrame) -> float:
class_y_pred_proba_columns = model_output_column_names(self.y_pred_proba)
class_uncalibrated_y_pred_proba_columns = ['uncalibrated_' + el for el in class_y_pred_proba_columns]
needed_columns = class_y_pred_proba_columns + class_uncalibrated_y_pred_proba_columns
needed_columns = self.class_probability_columns + self.class_uncalibrated_y_pred_proba_columns
_list_missing(needed_columns, data)
data, empty = common_nan_removal(data[needed_columns], needed_columns)
if empty:
@@ -2404,25 +2409,28 @@ def _sampling_error(self, data: pd.DataFrame) -> float:
return mse.auroc_sampling_error(self._sampling_error_components, data)

def _realized_performance(self, data: pd.DataFrame) -> float:
class_y_pred_proba_columns = model_output_column_names(self.y_pred_proba)
class_uncalibrated_y_pred_proba_columns = ['uncalibrated_' + el for el in class_y_pred_proba_columns]
try:
_list_missing([self.y_true] + class_uncalibrated_y_pred_proba_columns, data)
_list_missing([self.y_true] + self.class_uncalibrated_y_pred_proba_columns, data)
except InvalidArgumentsException as ex:
if "missing required columns" in str(ex):
self._logger.debug(str(ex))
return np.NaN
else:
raise ex

data, empty = common_nan_removal(data, [self.y_true] + class_uncalibrated_y_pred_proba_columns)
data, empty = common_nan_removal(data, [self.y_true] + self.class_uncalibrated_y_pred_proba_columns)
if empty:
warnings.warn(f"Too many missing values, cannot calculate {self.display_name}. " f"Returning NaN.")
return np.NaN

y_true = data[self.y_true]
if y_true.nunique() <= 1:
warnings.warn("Too few unique values present in 'y_true', returning NaN as realized ROC-AUC.")
if set(y_true.unique()) != set(self.classes):
_message = (
f"'{self.y_true}' does not contain all reported classes, cannot calculate {self.display_name}. "
"Returning NaN."
)
warnings.warn(_message)
self._logger.warning(_message)
return np.NaN

_, y_pred_probas, labels = _get_multiclass_uncalibrated_predictions(data, self.y_pred, self.y_pred_proba)
@@ -3158,7 +3166,7 @@ def _multi_class_confusion_matrix_realized_performance(self, data: pd.DataFrame)
warnings.warn(
f"Too few unique values present in 'y_pred', returning NaN as realized {self.display_name} score."
)
return nan_array
return nan_array

cm = confusion_matrix(
data[self.y_true], data[self.y_pred], labels=self.classes, normalize=self.normalize_confusion_matrix
@@ -1,15 +1,13 @@
# Author: Niels Nuyttens <niels@nannyml.com>
# #
# License: Apache Software License 2.0
# Author: Niels Nuyttens <niels@nannyml.com>
#
# License: Apache Software License 2.0


"""Unit tests for performance metrics."""
from typing import Tuple

import pandas as pd
import pytest
from logging import getLogger

from nannyml import PerformanceCalculator
from nannyml._typing import ProblemType
@@ -27,6 +25,8 @@
)
from nannyml.thresholds import ConstantThreshold, StandardDeviationThreshold

LOGGER = getLogger(__name__)


@pytest.fixture(scope='module')
def multiclass_data() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: # noqa: D103
@@ -94,7 +94,7 @@ def no_timestamp_metrics(performance_calculator, multiclass_data) -> pd.DataFram
def test_metric_factory_returns_correct_metric_given_key_and_problem_type(key, problem_type, metric): # noqa: D103
calc = PerformanceCalculator(
timestamp_column_name='timestamp',
y_pred_proba='y_pred_proba',
y_pred_proba={'class1': 'y_pred_proba1', 'class2': 'y_pred_proba2', 'class3': 'y_pred_proba3'},
y_pred='y_pred',
y_true='y_true',
metrics=['roc_auc', 'f1'],
@@ -229,3 +229,49 @@ def test_metric_logs_warning_when_upper_threshold_is_overridden_by_metric_limits
f'{metric.display_name} upper threshold value 2 overridden by '
f'upper threshold value limit {metric.upper_threshold_value_limit}' in caplog.messages
)


def test_auroc_errors_out_when_not_all_classes_are_represented_reference(multiclass_data, caplog):
LOGGER.info("testing test_auroc_errors_out_when_not_all_classes_are_represented_reference")
reference, _, _ = multiclass_data
reference['y_pred_proba_clazz'] = reference['y_pred_proba_upmarket_card']
performance_calculator = PerformanceCalculator(
y_pred_proba={
'prepaid_card': 'y_pred_proba_prepaid_card',
'highstreet_card': 'y_pred_proba_highstreet_card',
'upmarket_card': 'y_pred_proba_upmarket_card',
'clazz': 'y_pred_proba_clazz'
},
y_pred='y_pred',
y_true='y_true',
metrics=['roc_auc'],
problem_type='classification_multiclass',
)
performance_calculator.fit(reference)
expected_exc_test = "y_pred_proba class and class probabilities dictionary does not match reference data."
assert expected_exc_test in caplog.text


def test_auroc_errors_out_when_not_all_classes_are_represented_chunk(multiclass_data, caplog):
LOGGER.info("testing test_auroc_errors_out_when_not_all_classes_are_represented_chunk")
reference, monitored, targets = multiclass_data
monitored = monitored.merge(targets)
reference['y_pred_proba_clazz'] = reference['y_pred_proba_upmarket_card']
monitored['y_pred_proba_clazz'] = monitored['y_pred_proba_upmarket_card']
reference['y_true'].iloc[-1000:] = 'clazz'
performance_calculator = PerformanceCalculator(
y_pred_proba={
'prepaid_card': 'y_pred_proba_prepaid_card',
'highstreet_card': 'y_pred_proba_highstreet_card',
'upmarket_card': 'y_pred_proba_upmarket_card',
'clazz': 'y_pred_proba_clazz'
},
y_pred='y_pred',
y_true='y_true',
metrics=['roc_auc'],
problem_type='classification_multiclass',
)
performance_calculator.fit(reference)
_ = performance_calculator.calculate(monitored)
expected_exc_test = "does not contain all reported classes, cannot calculate"
assert expected_exc_test in caplog.text