[skip ci] Updated docstrings for calculator/estimator init

nnansters · nnansters · commit 39a0a649d593 · 2022-09-22T15:21:21.000+02:00
diff --git a/nannyml/drift/model_inputs/multivariate/data_reconstruction/calculator.py b/nannyml/drift/model_inputs/multivariate/data_reconstruction/calculator.py
@@ -66,30 +66,27 @@ def __init__(
         Examples
         --------
         >>> import nannyml as nml
-        >>> reference_df, analysis_df, _ = nml.load_synthetic_binary_classification_dataset()
-        >>>
-        >>> feature_column_names = [col for col in reference_df.columns
-        >>>                         if col not in ['y_pred', 'y_pred_proba', 'work_home_actual', 'timestamp']]
+        >>> from IPython.display import display
+        >>> # Load synthetic data
+        >>> reference = nml.load_synthetic_binary_classification_dataset()[0]
+        >>> analysis = nml.load_synthetic_binary_classification_dataset()[1]
+        >>> display(reference.head())
+        >>> # Define feature columns
+        >>> feature_column_names = [
+        ...     col for col in reference.columns if col not in [
+        ...         'timestamp', 'y_pred_proba', 'period', 'y_pred', 'work_home_actual', 'identifier'
+        ...     ]]
         >>> calc = nml.DataReconstructionDriftCalculator(
-        >>>     feature_column_names=feature_column_names,
-        >>>     timestamp_column_name='timestamp'
+        ...     feature_column_names=feature_column_names,
+        ...     timestamp_column_name='timestamp',
+        ...     chunk_size=5000
         >>> )
-        >>> calc.fit(reference_df)
-        >>> results = calc.calculate(analysis_df)
-        >>> print(results.data)  # access the numbers
-                             key  start_index  ...  upper_threshold alert
-        0       [0:4999]            0  ...         1.511762  True
-        1    [5000:9999]         5000  ...         1.511762  True
-        2  [10000:14999]        10000  ...         1.511762  True
-        3  [15000:19999]        15000  ...         1.511762  True
-        4  [20000:24999]        20000  ...         1.511762  True
-        5  [25000:29999]        25000  ...         1.511762  True
-        6  [30000:34999]        30000  ...         1.511762  True
-        7  [35000:39999]        35000  ...         1.511762  True
-        8  [40000:44999]        40000  ...         1.511762  True
-        9  [45000:49999]        45000  ...         1.511762  True
-        >>> fig = results.plot(kind='drift', plot_reference=True)
-        >>> fig.show()
+        >>> calc.fit(reference)
+        >>> results = calc.calculate(analysis)
+        >>> display(results.data)
+        >>> display(results.calculator.previous_reference_results)
+        >>> figure = results.plot(plot_reference=True)
+        >>> figure.show()
         """
         super(DataReconstructionDriftCalculator, self).__init__(
             chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name
diff --git a/nannyml/drift/model_inputs/univariate/statistical/calculator.py b/nannyml/drift/model_inputs/univariate/statistical/calculator.py
@@ -56,31 +56,45 @@ def __init__(
         Examples
         --------
         >>> import nannyml as nml
-        >>>
-        >>> reference_df, analysis_df, _ = nml.load_synthetic_binary_classification_dataset()
-        >>>
-        >>> feature_column_names = [col for col in reference_df.columns
-        >>>                         if col not in ['y_pred', 'y_pred_proba', 'work_home_actual', 'timestamp']]
+        >>> from IPython.display import display
+        >>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
+        >>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
+        >>> display(reference_df.head())
+        >>> feature_column_names = [
+        ...     col for col in reference_df.columns if col not in [
+        ...         'timestamp', 'y_pred_proba', 'period', 'y_pred', 'work_home_actual', 'identifier'
+        ...     ]]
         >>> calc = nml.UnivariateStatisticalDriftCalculator(
-        >>>     feature_column_names=feature_column_names,
-        >>>     timestamp_column_name='timestamp'
+        ...     feature_column_names=feature_column_names,
+        ...     timestamp_column_name='timestamp'
         >>> )
         >>> calc.fit(reference_df)
         >>> results = calc.calculate(analysis_df)
-        >>> print(results.data)  # check the numbers
-                     key  start_index  ...  identifier_alert identifier_threshold
-        0       [0:4999]            0  ...              True                 0.05
-        1    [5000:9999]         5000  ...              True                 0.05
-        2  [10000:14999]        10000  ...              True                 0.05
-        3  [15000:19999]        15000  ...              True                 0.05
-        4  [20000:24999]        20000  ...              True                 0.05
-        5  [25000:29999]        25000  ...              True                 0.05
-        6  [30000:34999]        30000  ...              True                 0.05
-        7  [35000:39999]        35000  ...              True                 0.05
-        8  [40000:44999]        40000  ...              True                 0.05
-        9  [45000:49999]        45000  ...              True                 0.05
-        >>> fig = results.plot(kind='feature_drift', plot_reference=True, feature_column_name='distance_from_office')
-        >>> fig.show()
+        >>> display(results.data.iloc[:, :9])
+        >>> display(calc.previous_reference_results.iloc[:, :9])
+        >>> for feature in calc.feature_column_names:
+        ...     drift_fig = results.plot(
+        ...         kind='feature_drift',
+        ...         feature_column_name=feature,
+        ...         plot_reference=True
+        ...     )
+        ...     drift_fig.show()
+        >>> for cont_feat in calc.continuous_column_names:
+        ...     figure = results.plot(
+        ...         kind='feature_distribution',
+        ...         feature_column_name=cont_feat,
+        ...         plot_reference=True
+        ...     )
+        ...     figure.show()
+        >>> for cat_feat in calc.categorical_column_names:
+        ...     figure = results.plot(
+        ...         kind='feature_distribution',
+        ...         feature_column_name=cat_feat,
+        ...         plot_reference=True)
+        ...     figure.show()
+        >>> ranker = nml.Ranker.by('alert_count')
+        >>> ranked_features = ranker.rank(results, only_drifting=False)
+        >>> display(ranked_features)
         """
         super(UnivariateStatisticalDriftCalculator, self).__init__(
             chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name
diff --git a/nannyml/drift/model_outputs/univariate/statistical/calculator.py b/nannyml/drift/model_outputs/univariate/statistical/calculator.py
@@ -61,34 +61,27 @@ def __init__(
         Examples
         --------
         >>> import nannyml as nml
-        >>>
-        >>> reference_df, analysis_df, _ = nml.load_synthetic_binary_classification_dataset()
-        >>>
+        >>> from IPython.display import display
+        >>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
+        >>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
+        >>> display(reference_df.head())
         >>> calc = nml.StatisticalOutputDriftCalculator(
-        >>>     y_pred_proba='y_pred_proba',
-        >>>     y_pred='y_pred',
-        >>>     timestamp_column_name='timestamp'
+        ...     y_pred='y_pred',
+        ...     y_pred_proba='y_pred_proba',
+        ...     timestamp_column_name='timestamp',
+        ...     problem_type='classification_binary'
         >>> )
         >>> calc.fit(reference_df)
         >>> results = calc.calculate(analysis_df)
-        >>>
-        >>> print(results.data)  # check the numbers
-                     key  start_index  ...  y_pred_proba_alert y_pred_proba_threshold
-        0       [0:4999]            0  ...                True                   0.05
-        1    [5000:9999]         5000  ...               False                   0.05
-        2  [10000:14999]        10000  ...               False                   0.05
-        3  [15000:19999]        15000  ...               False                   0.05
-        4  [20000:24999]        20000  ...               False                   0.05
-        5  [25000:29999]        25000  ...                True                   0.05
-        6  [30000:34999]        30000  ...                True                   0.05
-        7  [35000:39999]        35000  ...                True                   0.05
-        8  [40000:44999]        40000  ...                True                   0.05
-        9  [45000:49999]        45000  ...                True                   0.05
-        >>>
-        >>> results.plot(kind='score_drift', metric='p_value', plot_reference=True).show()
-        >>> results.plot(kind='score_distribution', plot_reference=True).show()
-        >>> results.plot(kind='prediction_drift', plot_reference=True).show()
-        >>> results.plot(kind='prediction_distribution', plot_reference=True).show()
+        >>> display(results.data)
+        >>> score_drift_fig = results.plot(kind='score_drift', plot_reference=True)
+        >>> score_drift_fig.show()
+        >>> score_distribution_fig = results.plot(kind='score_distribution', plot_reference=True)
+        >>> score_distribution_fig.show()
+        >>> prediction_drift_fig = results.plot(kind='prediction_drift', plot_reference=True)
+        >>> prediction_drift_fig.show()
+        >>> prediction_distribution_fig = results.plot(kind='prediction_distribution', plot_reference=True)
+        >>> prediction_distribution_fig.show()
         """
         super(StatisticalOutputDriftCalculator, self).__init__(
             chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name
diff --git a/nannyml/drift/target/target_distribution/calculator.py b/nannyml/drift/target/target_distribution/calculator.py
@@ -58,30 +58,24 @@ def __init__(
         Examples
         --------
         >>> import nannyml as nml
-        >>>
-        >>> reference_df, analysis_df, target_df = nml.load_synthetic_binary_classification_dataset()
-        >>>
+        >>> from IPython.display import display
+        >>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
+        >>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
+        >>> analysis_target_df = nml.load_synthetic_binary_classification_dataset()[2]
+        >>> analysis_df = analysis_df.merge(analysis_target_df, on='identifier')
+        >>> display(reference_df.head(3))
         >>> calc = nml.TargetDistributionCalculator(
-        >>>     y_true='work_home_actual',
-        >>>     timestamp_column_name='timestamp'
+        ...     y_true='work_home_actual',
+        ...     timestamp_column_name='timestamp',
+        ...     problem_type='classification_binary'
         >>> )
         >>> calc.fit(reference_df)
-        >>> results = calc.calculate(analysis_df.merge(target_df, on='identifier'))
-        >>> print(results.data)  # check the numbers
-                     key  start_index  end_index  ... thresholds  alert significant
-        0       [0:4999]            0       4999  ...       0.05   True        True
-        1    [5000:9999]         5000       9999  ...       0.05  False       False
-        2  [10000:14999]        10000      14999  ...       0.05  False       False
-        3  [15000:19999]        15000      19999  ...       0.05  False       False
-        4  [20000:24999]        20000      24999  ...       0.05  False       False
-        5  [25000:29999]        25000      29999  ...       0.05  False       False
-        6  [30000:34999]        30000      34999  ...       0.05  False       False
-        7  [35000:39999]        35000      39999  ...       0.05  False       False
-        8  [40000:44999]        40000      44999  ...       0.05  False       False
-        9  [45000:49999]        45000      49999  ...       0.05  False       False
-        >>>
-        >>> results.plot(kind='target_drift', plot_reference=True).show()
-        >>> results.plot(kind='target_distribution', plot_reference=True).show()
+        >>> results = calc.calculate(analysis_df)
+        >>> display(results.data.head(3))
+        >>> target_drift_fig = results.plot(kind='target_drift', plot_reference=True)
+        >>> target_drift_fig.show()
+        >>> target_distribution_fig = results.plot(kind='target_distribution', plot_reference=True)
+        >>> target_distribution_fig.show()
         """
         super().__init__(chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name)
 
diff --git a/nannyml/performance_calculation/calculator.py b/nannyml/performance_calculation/calculator.py
@@ -71,29 +71,27 @@ def __init__(
         Examples
         --------
         >>> import nannyml as nml
-        >>>
-        >>> reference_df, analysis_df, target_df = nml.load_synthetic_binary_classification_dataset()
-        >>>
-        >>> calc = nml.PerformanceCalculator(y_true='work_home_actual', y_pred='y_pred', y_pred_proba='y_pred_proba',
-        >>>                                  timestamp_column_name='timestamp', metrics=['f1', 'roc_auc'])
-        >>>
+        >>> from IPython.display import display
+        >>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
+        >>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
+        >>> analysis_target_df = nml.load_synthetic_binary_classification_dataset()[2]
+        >>> analysis_df = analysis_df.merge(analysis_target_df, on='identifier')
+        >>> display(reference_df.head(3))
+        >>> calc = nml.PerformanceCalculator(
+        ...     y_pred_proba='y_pred_proba',
+        ...     y_pred='y_pred',
+        ...     y_true='work_home_actual',
+        ...     timestamp_column_name='timestamp',
+        ...     problem_type='classification_binary',
+        ...     metrics=['roc_auc', 'f1', 'precision', 'recall', 'specificity', 'accuracy'],
+        ...     chunk_size=5000)
         >>> calc.fit(reference_df)
-        >>>
-        >>> results = calc.calculate(analysis_df.merge(target_df, on='identifier'))
-        >>> print(results.data)
-                     key  start_index  ...  roc_auc_upper_threshold roc_auc_alert
-        0       [0:4999]            0  ...                  0.97866         False
-        1    [5000:9999]         5000  ...                  0.97866         False
-        2  [10000:14999]        10000  ...                  0.97866         False
-        3  [15000:19999]        15000  ...                  0.97866         False
-        4  [20000:24999]        20000  ...                  0.97866         False
-        5  [25000:29999]        25000  ...                  0.97866          True
-        6  [30000:34999]        30000  ...                  0.97866          True
-        7  [35000:39999]        35000  ...                  0.97866          True
-        8  [40000:44999]        40000  ...                  0.97866          True
-        9  [45000:49999]        45000  ...                  0.97866          True
+        >>> results = calc.calculate(analysis_df)
+        >>> display(results.data)
+        >>> display(results.calculator.previous_reference_results)
         >>> for metric in calc.metrics:
-        >>>     results.plot(metric=metric, plot_reference=True).show()
+        ...     figure = results.plot(kind='performance', plot_reference=True, metric=metric)
+        ...     figure.show()
         """
         super().__init__(chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name)
 
diff --git a/nannyml/performance_estimation/confidence_based/cbpe.py b/nannyml/performance_estimation/confidence_based/cbpe.py
@@ -91,36 +91,28 @@ def __init__(
         Examples
         --------
         >>> import nannyml as nml
-        >>>
-        >>> reference_df, analysis_df, target_df = nml.load_synthetic_binary_classification_dataset()
-        >>>
+        >>> from IPython.display import display
+        >>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
+        >>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
+        >>> display(reference_df.head(3))
         >>> estimator = nml.CBPE(
-        >>>     y_true='work_home_actual',
-        >>>     y_pred='y_pred',
-        >>>     y_pred_proba='y_pred_proba',
-        >>>     timestamp_column_name='timestamp',
-        >>>     metrics=['f1', 'roc_auc'],
-        >>>     problem_type='classification_binary',
+        ...     y_pred_proba='y_pred_proba',
+        ...     y_pred='y_pred',
+        ...     y_true='work_home_actual',
+        ...     timestamp_column_name='timestamp',
+        ...     metrics=['roc_auc', 'f1'],
+        ...     chunk_size=5000,
+        ...     problem_type='classification_binary',
         >>> )
-        >>>
         >>> estimator.fit(reference_df)
-        >>>
         >>> results = estimator.estimate(analysis_df)
-        >>> print(results.data)
-                     key  start_index  ...  lower_threshold_roc_auc alert_roc_auc
-        0       [0:4999]            0  ...                  0.97866         False
-        1    [5000:9999]         5000  ...                  0.97866         False
-        2  [10000:14999]        10000  ...                  0.97866         False
-        3  [15000:19999]        15000  ...                  0.97866         False
-        4  [20000:24999]        20000  ...                  0.97866         False
-        5  [25000:29999]        25000  ...                  0.97866          True
-        6  [30000:34999]        30000  ...                  0.97866          True
-        7  [35000:39999]        35000  ...                  0.97866          True
-        8  [40000:44999]        40000  ...                  0.97866          True
-        9  [45000:49999]        45000  ...                  0.97866          True
+        >>> display(results.data)
         >>> for metric in estimator.metrics:
-        >>>     results.plot(metric=metric, plot_reference=True).show()
-
+        ...     metric_fig = results.plot(kind='performance', metric=metric)
+        ...     metric_fig.show()
+        >>> for metric in estimator.metrics:
+        ...     metric_fig = results.plot(kind='performance', plot_reference=True, metric=metric)
+        ...     metric_fig.show()
         """
         super().__init__(chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name)