Don't allow confidence range to go above/below threshold limits

NannyML · Feb 8, 2024 · 2b4655a · 2b4655a
1 parent fc6d253
commit 2b4655a
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 11 deletions.
diff --git a/nannyml/data_quality/missing/calculator.py b/nannyml/data_quality/missing/calculator.py
@@ -62,6 +62,9 @@ def __init__(
             Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
         chunker : Chunker
             The `Chunker` used to split the data sets into a lists of chunks.
+        threshold: Threshold, default=StandardDeviationThreshold
+            The threshold you wish to evaluate values on. Defaults to a StandardDeviationThreshold with default
+            options. The other available value is ConstantThreshold.
 
 
         Examples
@@ -102,14 +105,14 @@ def __init__(
         self._lower_alert_thresholds: Dict[str, Optional[float]] = {column_name: 0 for column_name in self.column_names}
 
         self.lower_threshold_value_limit: float = 0
-        self.upper_threshold_value_limit: float
+        self.upper_threshold_value_limit: Optional[float] = None
         self.normalize = normalize
         if self.normalize:
             self.data_quality_metric = 'missing_values_rate'
             self.upper_threshold_value_limit = 1
         else:
             self.data_quality_metric = 'missing_values_count'
-            self.upper_threshold_value_limit = np.nan
+            # self.upper_threshold_value_limit = np.nan
 
     def _calculate_missing_value_stats(self, data: pd.Series):
         count_tot = data.shape[0]
@@ -217,8 +220,14 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st
         else:
             result['sampling_error'] = serr * np.sqrt(tot)
 
-        result['upper_confidence_boundary'] = result['value'] + SAMPLING_ERROR_RANGE * result['sampling_error']
-        result['lower_confidence_boundary'] = result['value'] - SAMPLING_ERROR_RANGE * result['sampling_error']
+        result['upper_confidence_boundary'] = np.minimum(
+            result['value'] + SAMPLING_ERROR_RANGE * result['sampling_error'],
+            self.upper_threshold_value_limit or np.inf
+        )
+        result['lower_confidence_boundary'] = np.maximum(
+            result['value'] - SAMPLING_ERROR_RANGE * result['sampling_error'],
+            self.lower_threshold_value_limit or -np.inf
+        )
 
         result['upper_threshold'] = self._upper_alert_thresholds[column_name]
         result['lower_threshold'] = self._lower_alert_thresholds[column_name]

diff --git a/nannyml/data_quality/missing/result.py b/nannyml/data_quality/missing/result.py
@@ -24,7 +24,10 @@
 
 
 class Result(PerColumnResult, ResultCompareMixin):
-    """Contains the results of the univariate statistical drift calculation and provides plotting functionality."""
+    """Missing Values Result Class.
+
+    Contains calculation results and provides plotting functionality.
+    """
 
     def __init__(
         self,
@@ -34,13 +37,14 @@ def __init__(
         timestamp_column_name: Optional[str],
         chunker: Chunker,
     ):
+        """Initialize Missing Values Result Class."""
         super().__init__(results_data, column_names)
 
         self.timestamp_column_name = timestamp_column_name
         self.data_quality_metric = data_quality_metric
         self.chunker = chunker
 
-    def keys(self) -> List[Key]:
+    def keys(self) -> List[Key]:  # noqa: D102
         return [
             Key(
                 properties=(column_name,),
@@ -55,10 +59,7 @@ def plot(
         *args,
         **kwargs,
     ) -> go.Figure:
-        """
-
-        Parameters
-        ----------
+        """Plot Missing Values results.
 
         Returns
         -------
@@ -82,7 +83,6 @@ def plot(
         ...     res = res.filter(period='analysis', column_name=column_name).plot().show()
 
         """
-
         return plot_metrics(
             self,
             title='Data Quality ',