From 2f1bed45891fcc657e82e6982a5833f6593c61ab Mon Sep 17 00:00:00 2001 From: jnesfield Date: Tue, 9 Jul 2024 17:49:57 -0700 Subject: [PATCH 01/21] work in progress check point --- nannyml/data_quality/range/calculator.py | 267 +++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 nannyml/data_quality/range/calculator.py diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py new file mode 100644 index 00000000..df2cac90 --- /dev/null +++ b/nannyml/data_quality/range/calculator.py @@ -0,0 +1,267 @@ +# Author: James Nesfield +# +# License: Apache Software License 2.0 + +"""Continuous numerical variable range monitor to ensure range supplied is within training bounds.""" + +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import pandas as pd +from pandas import MultiIndex + +from nannyml.base import AbstractCalculator, _list_missing, _split_features_by_type +from nannyml.chunk import Chunker +from nannyml.exceptions import InvalidArgumentsException +from nannyml.sampling_error import SAMPLING_ERROR_RANGE +from nannyml.thresholds import StandardDeviationThreshold, Threshold, calculate_threshold_values +from nannyml.usage_logging import UsageEvent, log_usage + +from .result import Result + +""" +Missing Values Data Quality Module. +""" + + +class NumericalRangeCalculator(AbstractCalculator): + """NumericalRangeCalculator implementation to ensure inference data numerical ranges match training.""" + + def __init__( + self, + column_names: Union[str, List[str]], + normalize: bool = True, + timestamp_column_name: Optional[str] = None, + chunk_size: Optional[int] = None, + chunk_number: Optional[int] = None, + chunk_period: Optional[str] = None, + chunker: Optional[Chunker] = None, + threshold: Threshold = StandardDeviationThreshold(), + ): + """Creates a new NumericalRangeCalculator instance.
+ + Parameters + ---------- + column_names: Union[str, List[str]] + A string or list containing the names of features in the provided data set. + Out-of-range values will be calculated for each entry in this list. + normalize: bool, default=True + Whether to provide the ratio (True) or the absolute count (False) of out-of-range values. + timestamp_column_name: str + The name of the column containing the timestamp of the model prediction. + chunk_size: int + Splits the data into chunks containing `chunk_size` observations. + Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given. + chunk_number: int + Splits the data into `chunk_number` pieces. + Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given. + chunk_period: str + Splits the data according to the given period. + Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given. + chunker : Chunker + The `Chunker` used to split the data sets into a list of chunks. + threshold: Threshold, default=StandardDeviationThreshold + The threshold you wish to evaluate values on. Defaults to a StandardDeviationThreshold with default + options. The other available value is ConstantThreshold. + + + Examples + -------- + >>> import nannyml as nml + >>> reference_df, analysis_df, _ = nml.load_synthetic_car_price_dataset() + >>> feature_column_names = [col for col in reference_df.columns if col not in ['timestamp', 'y_pred', 'y_true']] + >>> calc = nml.MissingValuesCalculator( + ... column_names=feature_column_names, + ... timestamp_column_name='timestamp', + ... ).fit(reference_df) + >>> res = calc.calculate(analysis_df) + >>> for column_name in res.feature_column_names: + ...
res = res.filter(period='analysis', column_name=column_name).plot().show() + """ + super(NumericalRangeCalculator, self).__init__( + chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name + ) + if isinstance(column_names, str): + self.column_names = [column_names] + elif isinstance(column_names, list): + for el in column_names: + if not isinstance(el, str): + raise InvalidArgumentsException( + f"column_names elements should be either a column name string or a list of strings, found\n{el}" + ) + self.column_names = column_names + else: + raise InvalidArgumentsException( + "column_names should be either a column name string or a list of columns names strings, " + "found\n{column_names}" + ) + self.result: Optional[Result] = None + self.lower_threshold_value_limit: float = 0 + self.upper_threshold_value_limit: float + self.normalize = normalize + if self.normalize: + self.data_quality_metric = 'out_of_range_values_rate' + self.upper_threshold_value_limit = 1 + else: + self.data_quality_metric = 'out_of_range_values_count' + self.upper_threshold_value_limit = np.nan + + #object tracks values as list [min,max] + self._continuous_val_ranges: Dict[str, list] = {column_name: list() for column_name in self.column_names} + + + def _calculate_out_of_range_stats(self, data: pd.Series, lower_bound: float, upper_bound: float): + # to do make this calc out of range stats + count_tot = data.shape[0] + count_out_of_range = ((data < lower_bound) | (data > upper_bound)).sum() + if self.normalize: + count_out_of_range = count_out_of_range / count_tot + return count_out_of_range + + @log_usage(UsageEvent.DQ_CALC_MISSING_VALUES_FIT, metadata_from_self=['normalize']) + def _fit(self, reference_data: pd.DataFrame, *args, **kwargs): + """Fits the drift calculator to a set of reference data.""" + if reference_data.empty: + raise InvalidArgumentsException('data contains no rows. 
Please provide a valid data set.') + + _list_missing(self.column_names, reference_data) + + # All provided columns must be continuous + # We do not make int categorical + continuous_column_names, categorical_column_names = _split_features_by_type(reference_data, self.column_names) + if not set(self.column_names) == set(continuous_column_names): + raise InvalidArgumentsException( + f"Specified columns_names for NumericalRangeCalculator must all be continuous.\n" + f"Categorical columns found:\n{categorical_column_names}" + ) + + for col in self.column_names: + self._continuous_val_ranges[col] = [reference_data[col].min(), reference_data[col].max()] + + return self + + @log_usage(UsageEvent.DQ_CALC_VALUE_RANGES, metadata_from_self=['normalize']) + def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result: + """Calculates methods for both categorical and continuous columns.""" + if data.empty: + raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.') + + _list_missing(self.column_names, data) + + chunks = self.chunker.split(data) + + rows = [] + for chunk in chunks: + row = { + 'key': chunk.key, + 'chunk_index': chunk.chunk_index, + 'start_index': chunk.start_index, + 'end_index': chunk.end_index, + 'start_datetime': chunk.start_datetime, + 'end_datetime': chunk.end_datetime, + 'period': 'analysis', + } + + for column_name in self.column_names: + for k, v in self._calculate_for_column(chunk.data, column_name).items(): + row[f'{column_name}_{k}'] = v + + rows.append(row) + + result_index = _create_multilevel_index( + column_names=self.column_names, + ) + res = pd.DataFrame(rows) + res.columns = result_index + res = res.reset_index(drop=True) + + if self.result is None: + self._set_metric_thresholds(res) + res = self._populate_alert_thresholds(res) + self.result = Result( + results_data=res, + column_names=self.column_names, + data_quality_metric=self.data_quality_metric, + timestamp_column_name=self.timestamp_column_name, + 
chunker=self.chunker, + ) + else: + # TODO: review subclassing setup => superclass + '_filter' is screwing up typing. + # Dropping the intermediate '_filter' and directly returning the correct 'Result' class works OK + # but this causes us to lose the "common behavior" in the top level 'filter' method when overriding. + # Applicable here but to many of the base classes as well (e.g. fitting and calculating) + res = self._populate_alert_thresholds(res) + self.result = self.result.filter(period='reference') + self.result.data = pd.concat([self.result.data, res], ignore_index=True) + + return self.result + + def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[str, Any]: + result = {} + value, tot = self._calculate_out_of_range_stats(data[column_name]) + result['value'] = value + serr = np.sqrt( + self._sampling_error_components[column_name] * (1 - self._sampling_error_components[column_name]) + ) + if self.normalize: + result['sampling_error'] = serr / np.sqrt(tot) + else: + result['sampling_error'] = serr * np.sqrt(tot) + + result['upper_confidence_boundary'] = np.minimum( + result['value'] + SAMPLING_ERROR_RANGE * result['sampling_error'], + np.inf if self.upper_threshold_value_limit is None else self.upper_threshold_value_limit, + ) + result['lower_confidence_boundary'] = np.maximum( + result['value'] - SAMPLING_ERROR_RANGE * result['sampling_error'], + -np.inf if self.lower_threshold_value_limit is None else self.lower_threshold_value_limit, + ) + return result + + def _set_metric_thresholds(self, result_data: pd.DataFrame): + for column_name in self.column_names: + self._lower_alert_thresholds[column_name], self._upper_alert_thresholds[column_name] = calculate_threshold_values( # noqa: E501 + threshold=self.threshold, + data=result_data.loc[:, (column_name, 'value')], + lower_threshold_value_limit=self.lower_threshold_value_limit, + upper_threshold_value_limit=self.upper_threshold_value_limit, + logger=self._logger, + ) + + def 
_populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame: + for column_name in self.column_names: + result_data[(column_name, 'upper_threshold')] = self._upper_alert_thresholds[column_name] + result_data[(column_name, 'lower_threshold')] = self._lower_alert_thresholds[column_name] + result_data[(column_name, 'alert')] = result_data.apply( + lambda row: True + if ( + row[(column_name, 'value')] > ( + np.inf if row[(column_name, 'upper_threshold')] is None else row[(column_name, 'upper_threshold')] # noqa: E501 + ) + or row[(column_name, 'value')] < ( + -np.inf if row[(column_name, 'lower_threshold')] is None else row[(column_name, 'lower_threshold')] # noqa: E501 + ) + ) + else False, + axis=1, + ) + return result_data + + +def _create_multilevel_index( + column_names, +): + chunk_column_names = ['key', 'chunk_index', 'start_index', 'end_index', 'start_date', 'end_date', 'period'] + chunk_tuples = [('chunk', chunk_column_name) for chunk_column_name in chunk_column_names] + column_tuples = [ + (column_name, el) + for column_name in column_names + for el in [ + 'value', + 'sampling_error', + 'upper_confidence_boundary', + 'lower_confidence_boundary', + ] + ] + tuples = chunk_tuples + column_tuples + return MultiIndex.from_tuples(tuples) From fee07c1201be3914fdbc23b6a15cd42d53e377e0 Mon Sep 17 00:00:00 2001 From: jnesfield Date: Tue, 9 Jul 2024 17:53:41 -0700 Subject: [PATCH 02/21] stopping point for checks --- nannyml/data_quality/range/calculator.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index df2cac90..bcfc4e36 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -198,24 +198,9 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result: def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[str, Any]: result = {} - value, tot = 
self._calculate_out_of_range_stats(data[column_name]) + value_range = self._continuous_val_ranges[column_name] + value = self._calculate_out_of_range_stats(data[column_name], value_range[0],value_range[1]) result['value'] = value - serr = np.sqrt( - self._sampling_error_components[column_name] * (1 - self._sampling_error_components[column_name]) - ) - if self.normalize: - result['sampling_error'] = serr / np.sqrt(tot) - else: - result['sampling_error'] = serr * np.sqrt(tot) - - result['upper_confidence_boundary'] = np.minimum( - result['value'] + SAMPLING_ERROR_RANGE * result['sampling_error'], - np.inf if self.upper_threshold_value_limit is None else self.upper_threshold_value_limit, - ) - result['lower_confidence_boundary'] = np.maximum( - result['value'] - SAMPLING_ERROR_RANGE * result['sampling_error'], - -np.inf if self.lower_threshold_value_limit is None else self.lower_threshold_value_limit, - ) return result def _set_metric_thresholds(self, result_data: pd.DataFrame): From f0e3e2b854b2bd0ac44dd1eac5d0d6daeefe9fff Mon Sep 17 00:00:00 2001 From: jnesfield Date: Tue, 9 Jul 2024 17:59:52 -0700 Subject: [PATCH 03/21] more stuff --- nannyml/data_quality/range/__init__.py | 8 +++ nannyml/data_quality/range/calculator.py | 4 +- nannyml/data_quality/range/result.py | 91 ++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 nannyml/data_quality/range/__init__.py create mode 100644 nannyml/data_quality/range/result.py diff --git a/nannyml/data_quality/range/__init__.py b/nannyml/data_quality/range/__init__.py new file mode 100644 index 00000000..abe09b24 --- /dev/null +++ b/nannyml/data_quality/range/__init__.py @@ -0,0 +1,8 @@ +# Author: James Nesfield +# +# License: Apache Software License 2.0 + +"""Package containing the Data Quality Calculators implementation.""" + +from .calculator import NumericalRangeCalculator +from .result import Result diff --git a/nannyml/data_quality/range/calculator.py 
b/nannyml/data_quality/range/calculator.py index bcfc4e36..ea105aeb 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -20,7 +20,7 @@ from .result import Result """ -Missing Values Data Quality Module. +Values Out Of Range Data Quality Module. """ @@ -70,7 +70,7 @@ def __init__( >>> import nannyml as nml >>> reference_df, analysis_df, _ = nml.load_synthetic_car_price_dataset() >>> feature_column_names = [col for col in reference_df.columns if col not in ['timestamp', 'y_pred', 'y_true']] - >>> calc = nml.MissingValuesCalculator( + >>> calc = nml.NumericalRangeCalculator( ... column_names=feature_column_names, ... timestamp_column_name='timestamp', ... ).fit(reference_df) diff --git a/nannyml/data_quality/range/result.py b/nannyml/data_quality/range/result.py new file mode 100644 index 00000000..2c18cd37 --- /dev/null +++ b/nannyml/data_quality/range/result.py @@ -0,0 +1,91 @@ +# Author: Niels Nuyttens +# Author: Nikolaos Perrakis +# +# License: Apache Software License 2.0 + +"""Contains the results of the univariate statistical drift calculation and provides plotting functionality.""" +from __future__ import annotations + +import warnings +from typing import List, Optional + +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + import pandas as pd + +import plotly.graph_objects as go + +from nannyml._typing import Key +from nannyml.base import PerColumnResult +from nannyml.chunk import Chunker +from nannyml.plots.blueprints.comparisons import ResultCompareMixin +from nannyml.plots.blueprints.metrics import plot_metrics +from nannyml.usage_logging import UsageEvent, log_usage + + +class Result(PerColumnResult, ResultCompareMixin): + """Values Out Of Range Result Class. + + Contains calculation results and provides plotting functionality. 
+ """ + + def __init__( + self, + results_data: pd.DataFrame, + column_names: List[str], + data_quality_metric: str, + timestamp_column_name: Optional[str], + chunker: Chunker, + ): + """Values Out Of Range Result Class.""" + super().__init__(results_data, column_names) + + self.timestamp_column_name = timestamp_column_name + self.data_quality_metric = data_quality_metric + self.chunker = chunker + + def keys(self) -> List[Key]: # noqa: D102 + return [ + Key( + properties=(column_name,), + display_names=(column_name, f"{self.data_quality_metric.replace('_', ' ').title()}"), + ) + for column_name in self.column_names + ] + + @log_usage(UsageEvent.DQ_CALC_VALUES_OUT_OF_RANGE_PLOT) + def plot( + self, + *args, + **kwargs, + ) -> go.Figure: + """Values Out Of Range results. + + Returns + ------- + fig: :class:`plotly.graph_objs._figure.Figure` + A :class:`~plotly.graph_objs._figure.Figure` object containing the requested drift plot. + + Can be saved to disk using the :meth:`~plotly.graph_objs._figure.Figure.write_image` method + or shown rendered on screen using the :meth:`~plotly.graph_objs._figure.Figure.show` method. + + Examples + -------- + >>> import nannyml as nml + >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() + >>> column_names = [col for col in reference.columns if col not in ['timestamp', 'y_pred', 'y_true']] + >>> calc = nml.NumericalRangeCalculator( + ... column_names=column_names, + ... timestamp_column_name='timestamp', + ... ).fit(reference) + >>> res = calc.calculate(analysis) + >>> for column_name in res.column_names: + ... 
res = res.filter(period='analysis', column_name=column_name).plot().show() + + """ + return plot_metrics( + self, + title='Data Quality ', + subplot_title_format='{display_names[1]} for {display_names[0]}', + subplot_y_axis_title_format='{display_names[1]}', + ) From 03eeacdde2ea4a660cbfa369457e4ebd7e711f30 Mon Sep 17 00:00:00 2001 From: jnesfield Date: Wed, 10 Jul 2024 19:30:20 -0700 Subject: [PATCH 04/21] updated excluded cols for examples --- nannyml/data_quality/__init__.py | 1 + nannyml/data_quality/range/calculator.py | 2 +- nannyml/data_quality/range/result.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nannyml/data_quality/__init__.py b/nannyml/data_quality/__init__.py index 497061fe..dc5761b3 100644 --- a/nannyml/data_quality/__init__.py +++ b/nannyml/data_quality/__init__.py @@ -7,3 +7,4 @@ from .missing import MissingValuesCalculator from .unseen import UnseenValuesCalculator +from .range import NumericalRangeCalculator \ No newline at end of file diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index ea105aeb..b6c5c981 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -69,7 +69,7 @@ def __init__( -------- >>> import nannyml as nml >>> reference_df, analysis_df, _ = nml.load_synthetic_car_price_dataset() - >>> feature_column_names = [col for col in reference_df.columns if col not in ['timestamp', 'y_pred', 'y_true']] + >>> feature_column_names = [col for col in reference_df.columns if col not in ['fuel','transmission','timestamp', 'y_pred', 'y_true']] >>> calc = nml.NumericalRangeCalculator( ... column_names=feature_column_names, ... 
timestamp_column_name='timestamp', diff --git a/nannyml/data_quality/range/result.py b/nannyml/data_quality/range/result.py index 2c18cd37..26436f5a 100644 --- a/nannyml/data_quality/range/result.py +++ b/nannyml/data_quality/range/result.py @@ -73,7 +73,7 @@ def plot( -------- >>> import nannyml as nml >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() - >>> column_names = [col for col in reference.columns if col not in ['timestamp', 'y_pred', 'y_true']] + >>> column_names = [col for col in reference.columns if col not in ['fuel','transmission','timestamp', 'y_pred', 'y_true']] >>> calc = nml.NumericalRangeCalculator( ... column_names=column_names, ... timestamp_column_name='timestamp', From 5a41e80a9543e7917f565ae84bb1a9afe92253d4 Mon Sep 17 00:00:00 2001 From: jnesfield Date: Wed, 10 Jul 2024 19:37:48 -0700 Subject: [PATCH 05/21] updated inits --- nannyml/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nannyml/__init__.py b/nannyml/__init__.py index 24f2c7b2..f9821bcb 100644 --- a/nannyml/__init__.py +++ b/nannyml/__init__.py @@ -39,7 +39,7 @@ from .calibration import Calibrator, IsotonicCalibrator, needs_calibration from .chunk import Chunk, Chunker, CountBasedChunker, DefaultChunker, PeriodBasedChunker, SizeBasedChunker -from .data_quality import MissingValuesCalculator, UnseenValuesCalculator +from .data_quality import MissingValuesCalculator, UnseenValuesCalculator, NumericalRangeCalculator from .datasets import ( load_modified_california_housing_dataset, load_synthetic_binary_classification_dataset, From e24427a5be28be30797d558d15b63f498b0872d2 Mon Sep 17 00:00:00 2001 From: jnesfield Date: Wed, 10 Jul 2024 20:07:06 -0700 Subject: [PATCH 06/21] fixed errors --- nannyml/__init__.py | 2 +- nannyml/data_quality/range/calculator.py | 4 ++-- nannyml/usage_logging.py | 4 ++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/nannyml/__init__.py b/nannyml/__init__.py index f9821bcb..459473ff 100644 --- 
a/nannyml/__init__.py +++ b/nannyml/__init__.py @@ -31,7 +31,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.10.7' +__version__ = '0.10.8' import logging diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index b6c5c981..cdcb1841 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -118,7 +118,7 @@ def _calculate_out_of_range_stats(self, data: pd.Series, lower_bound: float, upp count_out_of_range = count_out_of_range / count_tot return count_out_of_range - @log_usage(UsageEvent.DQ_CALC_MISSING_VALUES_FIT, metadata_from_self=['normalize']) + @log_usage(UsageEvent.DQ_CALC_VALUES_OUT_OF_RANGE_FIT, metadata_from_self=['normalize']) def _fit(self, reference_data: pd.DataFrame, *args, **kwargs): """Fits the drift calculator to a set of reference data.""" if reference_data.empty: @@ -140,7 +140,7 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs): return self - @log_usage(UsageEvent.DQ_CALC_VALUE_RANGES, metadata_from_self=['normalize']) + @log_usage(UsageEvent.DQ_CALC_VALUES_OUT_OF_RANGE_RUN, metadata_from_self=['normalize']) def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result: """Calculates methods for both categorical and continuous columns.""" if data.empty: diff --git a/nannyml/usage_logging.py b/nannyml/usage_logging.py index ec025bc2..f3004527 100644 --- a/nannyml/usage_logging.py +++ b/nannyml/usage_logging.py @@ -70,6 +70,10 @@ class UsageEvent(str, Enum): DQ_CALC_UNSEEN_VALUES_RUN = "Data Quality Calculator Unseen Values run" DQ_CALC_UNSEEN_VALUES_PLOT = "Data Quality Calculator Unseen Values plot" + DQ_CALC_VALUES_OUT_OF_RANGE_FIT = "Data Quality Calculator Values Out Of Range fit" + DQ_CALC_VALUES_OUT_OF_RANGE_RUN = "Data Quality Calculator Values Out Of Range run" + DQ_CALC_VALUES_OUT_OF_RANGE_PLOT = "Data Quality Calculator Values Out 
Of Range Plot" + UNIVAR_DRIFT_CALC_FIT = "Univariate drift calculator fit" UNIVAR_DRIFT_CALC_RUN = "Univariate drift calculator run" UNIVAR_DRIFT_PLOT = "Univariate drift results plot" From 52cc34142c1f6c38583d089d98ff56b603a8b54f Mon Sep 17 00:00:00 2001 From: jnesfield Date: Wed, 10 Jul 2024 20:17:12 -0700 Subject: [PATCH 07/21] stuff --- nannyml/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nannyml/__init__.py b/nannyml/__init__.py index 459473ff..f9821bcb 100644 --- a/nannyml/__init__.py +++ b/nannyml/__init__.py @@ -31,7 +31,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.10.8' +__version__ = '0.10.7' import logging From be6f577f1537190118495c06d1b06151e927ee84 Mon Sep 17 00:00:00 2001 From: jnesfield Date: Thu, 11 Jul 2024 06:38:33 -0700 Subject: [PATCH 08/21] updated multi lvl indexing to mirror unseen v missing --- nannyml/data_quality/range/calculator.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index cdcb1841..5c8ba193 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -239,14 +239,9 @@ def _create_multilevel_index( chunk_column_names = ['key', 'chunk_index', 'start_index', 'end_index', 'start_date', 'end_date', 'period'] chunk_tuples = [('chunk', chunk_column_name) for chunk_column_name in chunk_column_names] column_tuples = [ - (column_name, el) + (column_name, 'value') for column_name in column_names - for el in [ - 'value', - 'sampling_error', - 'upper_confidence_boundary', - 'lower_confidence_boundary', - ] + # for el in ['value', 'upper_threshold', 'lower_threshold', 'alert'] ] tuples = chunk_tuples + column_tuples return MultiIndex.from_tuples(tuples) From 3073429bf3b504fcfec7045b1f6230bb4191d2d9 Mon Sep 17 00:00:00 2001 From: jnesfield Date: Thu, 11 
Jul 2024 07:16:47 -0700 Subject: [PATCH 09/21] corrected thresholds issue --- nannyml/data_quality/range/calculator.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index 5c8ba193..7d699bf8 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -96,9 +96,16 @@ def __init__( "found\n{column_names}" ) self.result: Optional[Result] = None + + # threshold strategy is the same across all columns + self.threshold = threshold + self._upper_alert_thresholds: Dict[str, Optional[float]] = {column_name: 0 for column_name in self.column_names} + self._lower_alert_thresholds: Dict[str, Optional[float]] = {column_name: 0 for column_name in self.column_names} + self.lower_threshold_value_limit: float = 0 - self.upper_threshold_value_limit: float + self.upper_threshold_value_limit: Optional[float] = None self.normalize = normalize + if self.normalize: self.data_quality_metric = 'out_of_range_values_rate' self.upper_threshold_value_limit = 1 From ae0067cd34e0efc1bc8cbde3ada8b74fce4d0219 Mon Sep 17 00:00:00 2001 From: jnesfield Date: Thu, 11 Jul 2024 13:47:42 -0700 Subject: [PATCH 10/21] sampling error fix --- nannyml/data_quality/range/calculator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index 7d699bf8..8481cc3e 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -96,6 +96,7 @@ def __init__( "found\n{column_names}" ) self.result: Optional[Result] = None + self._sampling_error_components: Dict[str, float] = {column_name: 0 for column_name in self.column_names} # threshold strategy is the same across all columns self.threshold = threshold From 36a4147f79dcb818a406a4c4c3e1b829b9167b3f Mon Sep 17 00:00:00 2001 From: jnesfield Date: Thu, 11 Jul 2024 13:48:55 -0700 Subject: [PATCH 
11/21] corrected examples and sampling error omission --- nannyml/data_quality/missing/calculator.py | 2 +- nannyml/data_quality/range/calculator.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nannyml/data_quality/missing/calculator.py b/nannyml/data_quality/missing/calculator.py index 657989ac..e635da82 100644 --- a/nannyml/data_quality/missing/calculator.py +++ b/nannyml/data_quality/missing/calculator.py @@ -76,7 +76,7 @@ def __init__( ... timestamp_column_name='timestamp', ... ).fit(reference_df) >>> res = calc.calculate(analysis_df) - >>> for column_name in res.feature_column_names: + >>> for column_name in res.column_names: ... res = res.filter(period='analysis', column_name=column_name).plot().show() """ super(MissingValuesCalculator, self).__init__( diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index 8481cc3e..a46a69b2 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -75,7 +75,7 @@ def __init__( ... timestamp_column_name='timestamp', ... ).fit(reference_df) >>> res = calc.calculate(analysis_df) - >>> for column_name in res.feature_column_names: + >>> for column_name in res.column_names: ... 
res = res.filter(period='analysis', column_name=column_name).plot().show() """ super(NumericalRangeCalculator, self).__init__( @@ -97,7 +97,7 @@ def __init__( ) self.result: Optional[Result] = None self._sampling_error_components: Dict[str, float] = {column_name: 0 for column_name in self.column_names} - + # threshold strategy is the same across all columns self.threshold = threshold self._upper_alert_thresholds: Dict[str, Optional[float]] = {column_name: 0 for column_name in self.column_names} From c580a848fb87a35d2f712b51698c5d7a046fa526 Mon Sep 17 00:00:00 2001 From: jnesfield Date: Thu, 11 Jul 2024 14:09:59 -0700 Subject: [PATCH 12/21] fixed more issues, updated examples --- nannyml/data_quality/missing/calculator.py | 2 +- nannyml/data_quality/missing/result.py | 2 +- nannyml/data_quality/range/calculator.py | 2 +- nannyml/data_quality/range/result.py | 8 +++++++- nannyml/data_quality/unseen/calculator.py | 4 ++-- nannyml/data_quality/unseen/result.py | 6 +++--- 6 files changed, 15 insertions(+), 9 deletions(-) diff --git a/nannyml/data_quality/missing/calculator.py b/nannyml/data_quality/missing/calculator.py index e635da82..3a280c54 100644 --- a/nannyml/data_quality/missing/calculator.py +++ b/nannyml/data_quality/missing/calculator.py @@ -77,7 +77,7 @@ def __init__( ... ).fit(reference_df) >>> res = calc.calculate(analysis_df) >>> for column_name in res.column_names: - ... res = res.filter(period='analysis', column_name=column_name).plot().show() + ... _ = res.filter(period='analysis', column_name=column_name).plot().show() """ super(MissingValuesCalculator, self).__init__( chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name diff --git a/nannyml/data_quality/missing/result.py b/nannyml/data_quality/missing/result.py index 0f723e7c..866ab916 100644 --- a/nannyml/data_quality/missing/result.py +++ b/nannyml/data_quality/missing/result.py @@ -80,7 +80,7 @@ def plot( ... 
).fit(reference) >>> res = calc.calculate(analysis) >>> for column_name in res.column_names: - ... res = res.filter(period='analysis', column_name=column_name).plot().show() + ... _ = res.filter(period='analysis', column_name=column_name).plot().show() """ return plot_metrics( diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index a46a69b2..4a0ef070 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -76,7 +76,7 @@ def __init__( ... ).fit(reference_df) >>> res = calc.calculate(analysis_df) >>> for column_name in res.column_names: - ... res = res.filter(period='analysis', column_name=column_name).plot().show() + ... _ = res.filter(period='analysis', column_name=column_name).plot().show() """ super(NumericalRangeCalculator, self).__init__( chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name diff --git a/nannyml/data_quality/range/result.py b/nannyml/data_quality/range/result.py index 26436f5a..943e69a6 100644 --- a/nannyml/data_quality/range/result.py +++ b/nannyml/data_quality/range/result.py @@ -80,12 +80,18 @@ def plot( ... ).fit(reference) >>> res = calc.calculate(analysis) >>> for column_name in res.column_names: - ... res = res.filter(period='analysis', column_name=column_name).plot().show() + ... _ = res.filter(period='analysis', column_name=column_name).plot().show() """ return plot_metrics( self, title='Data Quality ', + hover=Hover( + template='%{period}     %{alert}
' + 'Chunk: %{chunk_key}     %{x_coordinate}
' + '%{metric_name}: %{metric_value}', + show_extra=True, + ), subplot_title_format='{display_names[1]} for {display_names[0]}', subplot_y_axis_title_format='{display_names[1]}', ) diff --git a/nannyml/data_quality/unseen/calculator.py b/nannyml/data_quality/unseen/calculator.py index db8561f0..9859fc0d 100644 --- a/nannyml/data_quality/unseen/calculator.py +++ b/nannyml/data_quality/unseen/calculator.py @@ -69,14 +69,14 @@ def __init__( -------- >>> import nannyml as nml >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() - >>> column_names = [col for col in reference.columns if col not in ['timestamp', 'y_pred', 'y_true']] + >>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] >>> calc = nml.UnseenValuesCalculator( ... column_names=column_names, ... timestamp_column_name='timestamp', ... ).fit(reference) >>> res = calc.calculate(analysis) >>> for column_name in res.column_names: - ... res = res.filter(period='analysis', column_name=column_name).plot().show() + ... 
_ = res.filter(period='analysis', column_name=column_name).plot().show() """ super(UnseenValuesCalculator, self).__init__( chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name diff --git a/nannyml/data_quality/unseen/result.py b/nannyml/data_quality/unseen/result.py index 262951fa..f57bc852 100644 --- a/nannyml/data_quality/unseen/result.py +++ b/nannyml/data_quality/unseen/result.py @@ -75,14 +75,14 @@ def plot( -------- >>> import nannyml as nml >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() - >>> column_names = [col for col in reference.columns if col not in ['timestamp', 'y_pred', 'y_true']] - >>> calc = nml.MissingValuesCalculator( + >>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] + >>> calc = nml.UnseenValuesCalculator( ... column_names=column_names, ... timestamp_column_name='timestamp', ... ).fit(reference) >>> res = calc.calculate(analysis) >>> for column_name in res.column_names: - ... res = res.filter(period='analysis', column_name=column_name).plot().show() + ... 
_ = res.filter(period='analysis', column_name=column_name).plot().show() """ From 14ce704f9f3b1398975200f0ff345c39f8de1c8b Mon Sep 17 00:00:00 2001 From: jnesfield Date: Thu, 11 Jul 2024 14:10:48 -0700 Subject: [PATCH 13/21] removed unecessary sampling error code in range --- nannyml/data_quality/range/calculator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index 4a0ef070..fa33de53 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -13,7 +13,6 @@ from nannyml.base import AbstractCalculator, _list_missing, _split_features_by_type from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException -from nannyml.sampling_error import SAMPLING_ERROR_RANGE from nannyml.thresholds import StandardDeviationThreshold, Threshold, calculate_threshold_values from nannyml.usage_logging import UsageEvent, log_usage @@ -96,7 +95,6 @@ def __init__( "found\n{column_names}" ) self.result: Optional[Result] = None - self._sampling_error_components: Dict[str, float] = {column_name: 0 for column_name in self.column_names} # threshold strategy is the same across all columns self.threshold = threshold From 614b29597e60edf45ac1198d65dc79fe68fccd23 Mon Sep 17 00:00:00 2001 From: jnesfield Date: Thu, 11 Jul 2024 15:13:52 -0700 Subject: [PATCH 14/21] fixed plotting issues and issues in examples --- nannyml/data_quality/missing/calculator.py | 3 +-- nannyml/data_quality/missing/result.py | 3 +-- nannyml/data_quality/range/calculator.py | 3 +-- nannyml/data_quality/range/result.py | 26 ++++++++++++---------- nannyml/data_quality/unseen/calculator.py | 3 +-- nannyml/data_quality/unseen/result.py | 3 +-- 6 files changed, 19 insertions(+), 22 deletions(-) diff --git a/nannyml/data_quality/missing/calculator.py b/nannyml/data_quality/missing/calculator.py index 3a280c54..8ab3c8d5 100644 --- 
a/nannyml/data_quality/missing/calculator.py +++ b/nannyml/data_quality/missing/calculator.py @@ -76,8 +76,7 @@ def __init__( ... timestamp_column_name='timestamp', ... ).fit(reference_df) >>> res = calc.calculate(analysis_df) - >>> for column_name in res.column_names: - ... _ = res.filter(period='analysis', column_name=column_name).plot().show() + >>> res.filter(period='analysis').plot().show() """ super(MissingValuesCalculator, self).__init__( chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name diff --git a/nannyml/data_quality/missing/result.py b/nannyml/data_quality/missing/result.py index 866ab916..d40b32e9 100644 --- a/nannyml/data_quality/missing/result.py +++ b/nannyml/data_quality/missing/result.py @@ -79,8 +79,7 @@ def plot( ... timestamp_column_name='timestamp', ... ).fit(reference) >>> res = calc.calculate(analysis) - >>> for column_name in res.column_names: - ... _ = res.filter(period='analysis', column_name=column_name).plot().show() + >>> res.filter(period='analysis').plot().show() """ return plot_metrics( diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index fa33de53..8b14f21d 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -74,8 +74,7 @@ def __init__( ... timestamp_column_name='timestamp', ... ).fit(reference_df) >>> res = calc.calculate(analysis_df) - >>> for column_name in res.column_names: - ... 
_ = res.filter(period='analysis', column_name=column_name).plot().show() + >>> res.filter(period='analysis').plot().show() """ super(NumericalRangeCalculator, self).__init__( chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name diff --git a/nannyml/data_quality/range/result.py b/nannyml/data_quality/range/result.py index 943e69a6..28c02df8 100644 --- a/nannyml/data_quality/range/result.py +++ b/nannyml/data_quality/range/result.py @@ -18,16 +18,16 @@ from nannyml._typing import Key from nannyml.base import PerColumnResult from nannyml.chunk import Chunker + +# from nannyml.exceptions import InvalidArgumentsException from nannyml.plots.blueprints.comparisons import ResultCompareMixin from nannyml.plots.blueprints.metrics import plot_metrics +from nannyml.plots.components import Hover from nannyml.usage_logging import UsageEvent, log_usage class Result(PerColumnResult, ResultCompareMixin): - """Values Out Of Range Result Class. - - Contains calculation results and provides plotting functionality. - """ + """Contains the results of the univariate statistical drift calculation and provides plotting functionality.""" def __init__( self, @@ -37,14 +37,13 @@ def __init__( timestamp_column_name: Optional[str], chunker: Chunker, ): - """Values Out Of Range Result Class.""" super().__init__(results_data, column_names) self.timestamp_column_name = timestamp_column_name self.data_quality_metric = data_quality_metric self.chunker = chunker - def keys(self) -> List[Key]: # noqa: D102 + def keys(self) -> List[Key]: return [ Key( properties=(column_name,), @@ -53,13 +52,16 @@ def keys(self) -> List[Key]: # noqa: D102 for column_name in self.column_names ] - @log_usage(UsageEvent.DQ_CALC_VALUES_OUT_OF_RANGE_PLOT) + @log_usage(UsageEvent.DQ_CALC_UNSEEN_VALUES_PLOT) def plot( self, *args, **kwargs, ) -> go.Figure: - """Values Out Of Range results. 
+ """ + + Parameters + ---------- Returns ------- @@ -73,16 +75,16 @@ def plot( -------- >>> import nannyml as nml >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() - >>> column_names = [col for col in reference.columns if col not in ['fuel','transmission','timestamp', 'y_pred', 'y_true']] - >>> calc = nml.NumericalRangeCalculator( + >>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] + >>> calc = nml.UnseenValuesCalculator( ... column_names=column_names, ... timestamp_column_name='timestamp', ... ).fit(reference) >>> res = calc.calculate(analysis) - >>> for column_name in res.column_names: - ... _ = res.filter(period='analysis', column_name=column_name).plot().show() + >>> res.filter(period='analysis').plot().show() """ + return plot_metrics( self, title='Data Quality ', diff --git a/nannyml/data_quality/unseen/calculator.py b/nannyml/data_quality/unseen/calculator.py index 9859fc0d..15605aef 100644 --- a/nannyml/data_quality/unseen/calculator.py +++ b/nannyml/data_quality/unseen/calculator.py @@ -75,8 +75,7 @@ def __init__( ... timestamp_column_name='timestamp', ... ).fit(reference) >>> res = calc.calculate(analysis) - >>> for column_name in res.column_names: - ... _ = res.filter(period='analysis', column_name=column_name).plot().show() + >>> res.filter(period='analysis').plot().show() """ super(UnseenValuesCalculator, self).__init__( chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name diff --git a/nannyml/data_quality/unseen/result.py b/nannyml/data_quality/unseen/result.py index f57bc852..28c02df8 100644 --- a/nannyml/data_quality/unseen/result.py +++ b/nannyml/data_quality/unseen/result.py @@ -81,8 +81,7 @@ def plot( ... timestamp_column_name='timestamp', ... ).fit(reference) >>> res = calc.calculate(analysis) - >>> for column_name in res.column_names: - ... 
_ = res.filter(period='analysis', column_name=column_name).plot().show() + >>> res.filter(period='analysis').plot().show() """ From d2da0714759a94953a9faaf08d1e0226b0f01e56 Mon Sep 17 00:00:00 2001 From: jnesfield Date: Thu, 11 Jul 2024 15:22:43 -0700 Subject: [PATCH 15/21] stuff --- nannyml/data_quality/range/result.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nannyml/data_quality/range/result.py b/nannyml/data_quality/range/result.py index 28c02df8..c9b57bf7 100644 --- a/nannyml/data_quality/range/result.py +++ b/nannyml/data_quality/range/result.py @@ -84,7 +84,6 @@ def plot( >>> res.filter(period='analysis').plot().show() """ - return plot_metrics( self, title='Data Quality ', From f6bf29ce53e70aa7c2cc8413db185369c3651225 Mon Sep 17 00:00:00 2001 From: Niels Nuyttens Date: Fri, 19 Jul 2024 11:27:34 +0200 Subject: [PATCH 16/21] Replace the `StandardDeviationThreshold` by `ConstantThreshold` Because it fits the use case better! --- nannyml/data_quality/range/calculator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index 8b14f21d..9539ae1d 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -13,7 +13,7 @@ from nannyml.base import AbstractCalculator, _list_missing, _split_features_by_type from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException -from nannyml.thresholds import StandardDeviationThreshold, Threshold, calculate_threshold_values +from nannyml.thresholds import Threshold, calculate_threshold_values, ConstantThreshold from nannyml.usage_logging import UsageEvent, log_usage from .result import Result @@ -35,7 +35,7 @@ def __init__( chunk_number: Optional[int] = None, chunk_period: Optional[str] = None, chunker: Optional[Chunker] = None, - threshold: Threshold = StandardDeviationThreshold(), + threshold: Threshold = ConstantThreshold(lower=None, upper=0), 
): """Creates a new NumericalRangeCalculator instance. From 54be1ecac588c28c002b672812d746ade4e586e8 Mon Sep 17 00:00:00 2001 From: Niels Nuyttens Date: Fri, 19 Jul 2024 11:28:52 +0200 Subject: [PATCH 17/21] Add `self._calculate` call during fitting. To ensure the result object also contains results for the reference period. Even if they're all just 0 by definition. --- nannyml/data_quality/range/calculator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index 9539ae1d..47d109c7 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -143,6 +143,9 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs): for col in self.column_names: self._continuous_val_ranges[col] = [reference_data[col].min(), reference_data[col].max()] + self.result = self._calculate(data=reference_data) + self.result.data[('chunk', 'period')] = 'reference' + return self @log_usage(UsageEvent.DQ_CALC_VALUES_OUT_OF_RANGE_RUN, metadata_from_self=['normalize']) From 6100ab878e4be5a357ae0c449ffac37febe5318c Mon Sep 17 00:00:00 2001 From: Niels Nuyttens Date: Fri, 19 Jul 2024 11:43:16 +0200 Subject: [PATCH 18/21] Renaming `_continuous_val_ranges` to `_reference_value_ranges` --- nannyml/data_quality/range/calculator.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index 47d109c7..277cdd49 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -111,9 +111,8 @@ def __init__( self.data_quality_metric = 'out_of_range_values_count' self.upper_threshold_value_limit = np.nan - #object tracks values as list [min,max] - self._continuous_val_ranges: Dict[str, list] = {column_name: list() for column_name in self.column_names} - + # object tracks values as list [min,max] + self._reference_value_ranges: 
Dict[str, list] = {column_name: list() for column_name in self.column_names} def _calculate_out_of_range_stats(self, data: pd.Series, lower_bound: float, upper_bound: float): # to do make this calc out of range stats @@ -141,7 +140,7 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs): ) for col in self.column_names: - self._continuous_val_ranges[col] = [reference_data[col].min(), reference_data[col].max()] + self._reference_value_ranges[col] = [reference_data[col].min(), reference_data[col].max()] self.result = self._calculate(data=reference_data) self.result.data[('chunk', 'period')] = 'reference' @@ -206,8 +205,8 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result: def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[str, Any]: result = {} - value_range = self._continuous_val_ranges[column_name] - value = self._calculate_out_of_range_stats(data[column_name], value_range[0],value_range[1]) + value_range = self._reference_value_ranges[column_name] + value = self._calculate_out_of_range_stats(data[column_name], value_range[0], value_range[1]) result['value'] = value return result From c8409db743269965cc50d7d2a5556036da4cf92d Mon Sep 17 00:00:00 2001 From: Niels Nuyttens Date: Fri, 19 Jul 2024 11:46:15 +0200 Subject: [PATCH 19/21] Adjusting some comments and general linting stuff Adjusting some comments and general linting stuff --- nannyml/data_quality/__init__.py | 2 +- nannyml/data_quality/range/calculator.py | 31 +++++++++++++++-------- nannyml/data_quality/range/result.py | 3 ++- nannyml/data_quality/unseen/calculator.py | 23 ++++++++++++----- nannyml/data_quality/unseen/result.py | 3 ++- nannyml/usage_logging.py | 2 +- 6 files changed, 43 insertions(+), 21 deletions(-) diff --git a/nannyml/data_quality/__init__.py b/nannyml/data_quality/__init__.py index dc5761b3..cf509fb5 100644 --- a/nannyml/data_quality/__init__.py +++ b/nannyml/data_quality/__init__.py @@ -7,4 +7,4 @@ from .missing import 
MissingValuesCalculator from .unseen import UnseenValuesCalculator -from .range import NumericalRangeCalculator \ No newline at end of file +from .range import NumericalRangeCalculator diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index 277cdd49..63733c97 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -2,7 +2,7 @@ # # License: Apache Software License 2.0 -"""Continous numerical variable range monitor to ensure range supplied is within training bounds.""" +"""Continuous numerical variable range monitor to ensure range supplied is within training bounds.""" from typing import Any, Dict, List, Optional, Union @@ -15,7 +15,6 @@ from nannyml.exceptions import InvalidArgumentsException from nannyml.thresholds import Threshold, calculate_threshold_values, ConstantThreshold from nannyml.usage_logging import UsageEvent, log_usage - from .result import Result """ @@ -24,7 +23,7 @@ class NumericalRangeCalculator(AbstractCalculator): - """NumericalRangeCalculator implementation to ensure inference data numerical ranges match training.""" + """NumericalRangeCalculator ensures the monitoring data set numerical ranges match the reference data set ones.""" def __init__( self, @@ -68,7 +67,8 @@ def __init__( -------- >>> import nannyml as nml >>> reference_df, analysis_df, _ = nml.load_synthetic_car_price_dataset() - >>> feature_column_names = [col for col in reference_df.columns if col not in ['fuel','transmission','timestamp', 'y_pred', 'y_true']] + >>> feature_column_names = [col for col in reference_df.columns if col not in [ + ... 'fuel','transmission','timestamp', 'y_pred', 'y_true']] >>> calc = nml.NumericalRangeCalculator( ... column_names=feature_column_names, ... 
timestamp_column_name='timestamp', @@ -117,7 +117,7 @@ def __init__( def _calculate_out_of_range_stats(self, data: pd.Series, lower_bound: float, upper_bound: float): # to do make this calc out of range stats count_tot = data.shape[0] - count_out_of_range = ((data < lower_bound) | (data > upper_bound)).sum() + count_out_of_range = ((data < lower_bound) | (data > upper_bound)).sum() if self.normalize: count_out_of_range = count_out_of_range / count_tot return count_out_of_range @@ -138,7 +138,7 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs): f"Specified columns_names for NumericalRangeCalculator must all be continuous.\n" f"Categorical columns found:\n{categorical_column_names}" ) - + for col in self.column_names: self._reference_value_ranges[col] = [reference_data[col].min(), reference_data[col].max()] @@ -212,7 +212,10 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st def _set_metric_thresholds(self, result_data: pd.DataFrame): for column_name in self.column_names: - self._lower_alert_thresholds[column_name], self._upper_alert_thresholds[column_name] = calculate_threshold_values( # noqa: E501 + ( + self._lower_alert_thresholds[column_name], + self._upper_alert_thresholds[column_name], + ) = calculate_threshold_values( # noqa: E501 threshold=self.threshold, data=result_data.loc[:, (column_name, 'value')], lower_threshold_value_limit=self.lower_threshold_value_limit, @@ -227,11 +230,17 @@ def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame: result_data[(column_name, 'alert')] = result_data.apply( lambda row: True if ( - row[(column_name, 'value')] > ( - np.inf if row[(column_name, 'upper_threshold')] is None else row[(column_name, 'upper_threshold')] # noqa: E501 + row[(column_name, 'value')] + > ( + np.inf + if row[(column_name, 'upper_threshold')] is None + else row[(column_name, 'upper_threshold')] # noqa: E501 ) - or row[(column_name, 'value')] < ( - -np.inf if row[(column_name, 
'lower_threshold')] is None else row[(column_name, 'lower_threshold')] # noqa: E501 + or row[(column_name, 'value')] + < ( + -np.inf + if row[(column_name, 'lower_threshold')] is None + else row[(column_name, 'lower_threshold')] # noqa: E501 ) ) else False, diff --git a/nannyml/data_quality/range/result.py b/nannyml/data_quality/range/result.py index c9b57bf7..b6892ebc 100644 --- a/nannyml/data_quality/range/result.py +++ b/nannyml/data_quality/range/result.py @@ -75,7 +75,8 @@ def plot( -------- >>> import nannyml as nml >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() - >>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] + >>> column_names = [col for col in reference.columns if col not in [ + ... 'car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] >>> calc = nml.UnseenValuesCalculator( ... column_names=column_names, ... 
timestamp_column_name='timestamp', diff --git a/nannyml/data_quality/unseen/calculator.py b/nannyml/data_quality/unseen/calculator.py index 15605aef..621582e0 100644 --- a/nannyml/data_quality/unseen/calculator.py +++ b/nannyml/data_quality/unseen/calculator.py @@ -13,6 +13,7 @@ from nannyml.base import AbstractCalculator, _list_missing, _split_features_by_type from nannyml.chunk import Chunker + # from nannyml.data_quality.base import _add_alert_flag from nannyml.exceptions import InvalidArgumentsException from nannyml.thresholds import ConstantThreshold, Threshold, calculate_threshold_values @@ -69,7 +70,8 @@ def __init__( -------- >>> import nannyml as nml >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() - >>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] + >>> column_names = [col for col in reference.columns if col not in [ + ... 'car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] >>> calc = nml.UnseenValuesCalculator( ... column_names=column_names, ... 
timestamp_column_name='timestamp', @@ -217,7 +219,10 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st def _set_metric_thresholds(self, result_data: pd.DataFrame): for column_name in self.column_names: - self._lower_alert_thresholds[column_name], self._upper_alert_thresholds[column_name] = calculate_threshold_values( # noqa: E501 + ( + self._lower_alert_thresholds[column_name], + self._upper_alert_thresholds[column_name], + ) = calculate_threshold_values( # noqa: E501 threshold=self.threshold, data=result_data.loc[:, (column_name, 'value')], lower_threshold_value_limit=self.lower_threshold_value_limit, @@ -232,11 +237,17 @@ def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame: result_data[(column_name, 'alert')] = result_data.apply( lambda row: True if ( - row[(column_name, 'value')] > ( - np.inf if row[(column_name, 'upper_threshold')] is None else row[(column_name, 'upper_threshold')] # noqa: E501 + row[(column_name, 'value')] + > ( + np.inf + if row[(column_name, 'upper_threshold')] is None + else row[(column_name, 'upper_threshold')] # noqa: E501 ) - or row[(column_name, 'value')] < ( - -np.inf if row[(column_name, 'lower_threshold')] is None else row[(column_name, 'lower_threshold')] # noqa: E501 + or row[(column_name, 'value')] + < ( + -np.inf + if row[(column_name, 'lower_threshold')] is None + else row[(column_name, 'lower_threshold')] # noqa: E501 ) ) else False, diff --git a/nannyml/data_quality/unseen/result.py b/nannyml/data_quality/unseen/result.py index 28c02df8..4ba0b710 100644 --- a/nannyml/data_quality/unseen/result.py +++ b/nannyml/data_quality/unseen/result.py @@ -75,7 +75,8 @@ def plot( -------- >>> import nannyml as nml >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() - >>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] + >>> column_names = [col for 
col in reference.columns if col not in [ + .... 'car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] >>> calc = nml.UnseenValuesCalculator( ... column_names=column_names, ... timestamp_column_name='timestamp', diff --git a/nannyml/usage_logging.py b/nannyml/usage_logging.py index f3004527..7e4ea792 100644 --- a/nannyml/usage_logging.py +++ b/nannyml/usage_logging.py @@ -72,7 +72,7 @@ class UsageEvent(str, Enum): DQ_CALC_VALUES_OUT_OF_RANGE_FIT = "Data Quality Calculator Values Out Of Range fit" DQ_CALC_VALUES_OUT_OF_RANGE_RUN = "Data Quality Calculator Values Out Of Range run" - DQ_CALC_VALUES_OUT_OF_RANGE_PLOT = "Data Quality Calculator Values Out Of Range Plot" + DQ_CALC_VALUES_OUT_OF_RANGE_PLOT = "Data Quality Calculator Values Out Of Range Plot" UNIVAR_DRIFT_CALC_FIT = "Univariate drift calculator fit" UNIVAR_DRIFT_CALC_RUN = "Univariate drift calculator run" From fa048d4adc09bcd5354f1d2956b5be18b6b290d8 Mon Sep 17 00:00:00 2001 From: Niels Nuyttens Date: Fri, 19 Jul 2024 14:19:03 +0200 Subject: [PATCH 20/21] Added some tests --- nannyml/data_quality/range/calculator.py | 4 +- tests/data_quality/test_range.py | 161 +++++++++++++++++++++++ 2 files changed, 163 insertions(+), 2 deletions(-) create mode 100644 tests/data_quality/test_range.py diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index 63733c97..fc676559 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -135,8 +135,8 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs): continuous_column_names, categorical_column_names = _split_features_by_type(reference_data, self.column_names) if not set(self.column_names) == set(continuous_column_names): raise InvalidArgumentsException( - f"Specified columns_names for NumericalRangeCalculator must all be continuous.\n" - f"Categorical columns found:\n{categorical_column_names}" + f"Specified columns_names 
for NumericalRangeCalculator must all be continuous. " + f"Categorical columns found: {categorical_column_names}" ) for col in self.column_names: diff --git a/tests/data_quality/test_range.py b/tests/data_quality/test_range.py new file mode 100644 index 00000000..835a45a2 --- /dev/null +++ b/tests/data_quality/test_range.py @@ -0,0 +1,161 @@ +# Author: Niels Nuyttens +# Author: Nikolaos Perrakis +# +# License: Apache Software License 2.0 + +"""Tests for Numerical Range Data Quality package.""" + +import pandas as pd +import pytest + +from nannyml.data_quality.range import NumericalRangeCalculator, Result +from nannyml.datasets import load_synthetic_car_loan_data_quality_dataset +from nannyml.exceptions import InvalidArgumentsException + +continuous_column_names = ['car_value', 'debt_to_income_ratio', 'loan_length'] + + +@pytest.fixture(scope="module") +def numerical_range_result() -> Result: # noqa: D103 + reference, analysis, _ = load_synthetic_car_loan_data_quality_dataset() + + calc = NumericalRangeCalculator(column_names=continuous_column_names).fit(reference) + return calc.calculate(data=analysis) + + +def test_numerical_range_calculator_with_default_params_should_not_fail(): # noqa: D103 + reference, analysis, _ = load_synthetic_car_loan_data_quality_dataset() + try: + calc = NumericalRangeCalculator(column_names=continuous_column_names).fit(reference) + _ = calc.calculate(data=analysis) + except Exception: + pytest.fail() + + +def test_numerical_range_calculator_raises_invalid_arguments_exception_on_non_continuous_columns(): # noqa: D103 + reference, analysis, _ = load_synthetic_car_loan_data_quality_dataset() + with pytest.raises(InvalidArgumentsException, match=r".*['salary_range'].*"): + _ = NumericalRangeCalculator(column_names=continuous_column_names + ['salary_range']).fit(reference) + + +def test_numerical_range_calculator_with_custom_params_should_not_fail(): # noqa: D103 + reference, analysis, _ = load_synthetic_car_loan_data_quality_dataset() + 
try: + calc = NumericalRangeCalculator( + column_names=continuous_column_names, + chunk_period='M', + timestamp_column_name='timestamp', + normalize=False, + ).fit(reference) + _ = calc.calculate(data=analysis) + except Exception: + pytest.fail() + + +def test_numerical_range_calculator_validates_column_names_list_elements(): # noqa: D103 + with pytest.raises(InvalidArgumentsException): + _ = NumericalRangeCalculator( + column_names=[ + 'car_value', + {'ab': 1}, + ], + timestamp_column_name='timestamp', + normalize=False, + ) + + +def test_numerical_range_calculator_fit_should_raise_invalid_args_exception_when_no_data_present(): # noqa: D103, F821 + calc = NumericalRangeCalculator( + column_names=continuous_column_names, + timestamp_column_name='timestamp', + normalize=False, + ) + with pytest.raises(InvalidArgumentsException): + _ = calc.fit(pd.DataFrame()) + + +def test_numerical_range_calculator_calculate_should_raise_invalid_args_exception_when_no_data_present(): # noqa: D103 + reference, _, _ = load_synthetic_car_loan_data_quality_dataset() + calc = NumericalRangeCalculator(column_names=continuous_column_names).fit(reference_data=reference) + with pytest.raises(InvalidArgumentsException): + _ = calc.calculate(pd.DataFrame()) + + +def test_numerical_range_calculator_fit_should_raise_invalid_args_exception_when_column_missing(): # noqa: D103 + reference, _, _ = load_synthetic_car_loan_data_quality_dataset() + calc = NumericalRangeCalculator(column_names=continuous_column_names) + with pytest.raises(InvalidArgumentsException): + _ = calc.fit(reference.drop('car_value', axis=1)) + + +def test_numerical_range_calculator_calculate_should_raise_invalid_args_exception_when_column_missing(): # noqa: D103 + reference, analysis, _ = load_synthetic_car_loan_data_quality_dataset() + calc = NumericalRangeCalculator(column_names=continuous_column_names).fit(reference) + with pytest.raises(InvalidArgumentsException): + _ = calc.calculate(analysis.drop('car_value', axis=1)) + 
+ +@pytest.mark.parametrize( + 'normalize, expected_metric', [(True, 'out_of_range_values_rate'), (False, 'out_of_range_values_count')] +) +def test_metric_is_set_properly(normalize, expected_metric): # noqa: D103 + reference, analysis, _ = load_synthetic_car_loan_data_quality_dataset() + calc = NumericalRangeCalculator(column_names=continuous_column_names, normalize=normalize).fit(reference) + res = calc.calculate(analysis) + assert calc.data_quality_metric == expected_metric + assert res.data_quality_metric == expected_metric + + +def test_whether_result_data_dataframe_has_proper_columns(numerical_range_result): # noqa: D103 + cols = numerical_range_result.data.columns + assert len(cols) == 7 + 3 * 4 + assert ('chunk', 'key') in cols + assert ('chunk', 'chunk_index') in cols + assert ('chunk', 'start_index') in cols + assert ('chunk', 'start_date') in cols + assert ('chunk', 'end_index') in cols + assert ('chunk', 'end_date') in cols + assert ('chunk', 'period') in cols + assert ('car_value', 'value') in cols + assert ('car_value', 'upper_threshold') in cols + assert ('car_value', 'lower_threshold') in cols + assert ('car_value', 'alert') in cols + + +def test_results_filtering_column_as_str(numerical_range_result): # noqa: D103 + try: + numerical_range_result.filter(column_names='car_value') + except Exception: + pytest.fail() + + +def test_results_filtering_column_as_list(numerical_range_result): # noqa: D103 + try: + numerical_range_result.filter( + column_names=[ + 'car_value', + ] + ) + except Exception: + pytest.fail() + + +@pytest.mark.parametrize( + 'column_name, expected_values', + [ + ('car_value', [0] * 20), + ( + 'debt_to_income_ratio', + [0.0] * 16 + [0.0004, 0.0, 0.0, 0.0], + ), + ('loan_length', [0] * 20), + ], +) +def test_results_repaid_loan_on_prev_car_values(numerical_range_result, column_name, expected_values): # noqa: D103 + res = numerical_range_result.filter(column_names=column_name).to_df() + assert list(res[(column_name, 'value')]) == 
expected_values + + +def test_results_alerts(numerical_range_result): # noqa: D103 + res = numerical_range_result.filter(column_names='debt_to_income_ratio').to_df() + assert list(res[('debt_to_income_ratio', 'alert')]) == [False] * 16 + [True, False, False, False] From 57308076048524e8ddaf9213e512e233c4b219b5 Mon Sep 17 00:00:00 2001 From: Niels Nuyttens Date: Fri, 19 Jul 2024 14:34:37 +0200 Subject: [PATCH 21/21] Typing stuff... --- tests/data_quality/test_range.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/data_quality/test_range.py b/tests/data_quality/test_range.py index 835a45a2..7506c57d 100644 --- a/tests/data_quality/test_range.py +++ b/tests/data_quality/test_range.py @@ -8,7 +8,8 @@ import pandas as pd import pytest -from nannyml.data_quality.range import NumericalRangeCalculator, Result +from nannyml._typing import Result +from nannyml.data_quality.range import NumericalRangeCalculator from nannyml.datasets import load_synthetic_car_loan_data_quality_dataset from nannyml.exceptions import InvalidArgumentsException