Skip to content

Commit

Permalink
Don't allow the confidence range to go above/below the threshold limits
Browse files Browse the repository at this point in the history
  • Loading branch information
nikml committed Feb 8, 2024
1 parent fc6d253 commit 2b4655a
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 11 deletions.
17 changes: 13 additions & 4 deletions nannyml/data_quality/missing/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ def __init__(
Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
chunker : Chunker
The `Chunker` used to split the data sets into a list of chunks.
threshold: Threshold, default=StandardDeviationThreshold
The threshold you wish to evaluate values on. Defaults to a StandardDeviationThreshold with default
options. The other available value is ConstantThreshold.
Examples
Expand Down Expand Up @@ -102,14 +105,14 @@ def __init__(
self._lower_alert_thresholds: Dict[str, Optional[float]] = {column_name: 0 for column_name in self.column_names}

self.lower_threshold_value_limit: float = 0
self.upper_threshold_value_limit: float
self.upper_threshold_value_limit: Optional[float] = None
self.normalize = normalize
if self.normalize:
self.data_quality_metric = 'missing_values_rate'
self.upper_threshold_value_limit = 1
else:
self.data_quality_metric = 'missing_values_count'
self.upper_threshold_value_limit = np.nan
# self.upper_threshold_value_limit = np.nan

def _calculate_missing_value_stats(self, data: pd.Series):
count_tot = data.shape[0]
Expand Down Expand Up @@ -217,8 +220,14 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st
else:
result['sampling_error'] = serr * np.sqrt(tot)

result['upper_confidence_boundary'] = result['value'] + SAMPLING_ERROR_RANGE * result['sampling_error']
result['lower_confidence_boundary'] = result['value'] - SAMPLING_ERROR_RANGE * result['sampling_error']
result['upper_confidence_boundary'] = np.minimum(
result['value'] + SAMPLING_ERROR_RANGE * result['sampling_error'],
self.upper_threshold_value_limit or np.inf
)
result['lower_confidence_boundary'] = np.maximum(
result['value'] - SAMPLING_ERROR_RANGE * result['sampling_error'],
self.lower_threshold_value_limit or -np.inf
)

result['upper_threshold'] = self._upper_alert_thresholds[column_name]
result['lower_threshold'] = self._lower_alert_thresholds[column_name]
Expand Down
14 changes: 7 additions & 7 deletions nannyml/data_quality/missing/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@


class Result(PerColumnResult, ResultCompareMixin):
"""Contains the results of the univariate statistical drift calculation and provides plotting functionality."""
"""Missing Values Result Class.
Contains calculation results and provides plotting functionality.
"""

def __init__(
self,
Expand All @@ -34,13 +37,14 @@ def __init__(
timestamp_column_name: Optional[str],
chunker: Chunker,
):
"""Initialize Missing Values Result Class."""
super().__init__(results_data, column_names)

self.timestamp_column_name = timestamp_column_name
self.data_quality_metric = data_quality_metric
self.chunker = chunker

def keys(self) -> List[Key]:
def keys(self) -> List[Key]: # noqa: D102
return [
Key(
properties=(column_name,),
Expand All @@ -55,10 +59,7 @@ def plot(
*args,
**kwargs,
) -> go.Figure:
"""
Parameters
----------
"""Plot Missing Values results.
Returns
-------
Expand All @@ -82,7 +83,6 @@ def plot(
... res = res.filter(period='analysis', column_name=column_name).plot().show()
"""

return plot_metrics(
self,
title='Data Quality ',
Expand Down

0 comments on commit 2b4655a

Please sign in to comment.