Skip to content

Commit 39a0a64

Browse files
committed
[skip ci] Updated docstrings for calculator/estimator init
1 parent 4cc253f commit 39a0a64

File tree

6 files changed

+122
-134
lines changed

6 files changed

+122
-134
lines changed

nannyml/drift/model_inputs/multivariate/data_reconstruction/calculator.py

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -66,30 +66,27 @@ def __init__(
6666
Examples
6767
--------
6868
>>> import nannyml as nml
69-
>>> reference_df, analysis_df, _ = nml.load_synthetic_binary_classification_dataset()
70-
>>>
71-
>>> feature_column_names = [col for col in reference_df.columns
72-
>>> if col not in ['y_pred', 'y_pred_proba', 'work_home_actual', 'timestamp']]
69+
>>> from IPython.display import display
70+
>>> # Load synthetic data
71+
>>> reference = nml.load_synthetic_binary_classification_dataset()[0]
72+
>>> analysis = nml.load_synthetic_binary_classification_dataset()[1]
73+
>>> display(reference.head())
74+
>>> # Define feature columns
75+
>>> feature_column_names = [
76+
... col for col in reference.columns if col not in [
77+
... 'timestamp', 'y_pred_proba', 'period', 'y_pred', 'work_home_actual', 'identifier'
78+
... ]]
7379
>>> calc = nml.DataReconstructionDriftCalculator(
74-
>>> feature_column_names=feature_column_names,
75-
>>> timestamp_column_name='timestamp'
80+
... feature_column_names=feature_column_names,
81+
... timestamp_column_name='timestamp',
82+
... chunk_size=5000
7683
... )
77-
>>> calc.fit(reference_df)
78-
>>> results = calc.calculate(analysis_df)
79-
>>> print(results.data) # access the numbers
80-
key start_index ... upper_threshold alert
81-
0 [0:4999] 0 ... 1.511762 True
82-
1 [5000:9999] 5000 ... 1.511762 True
83-
2 [10000:14999] 10000 ... 1.511762 True
84-
3 [15000:19999] 15000 ... 1.511762 True
85-
4 [20000:24999] 20000 ... 1.511762 True
86-
5 [25000:29999] 25000 ... 1.511762 True
87-
6 [30000:34999] 30000 ... 1.511762 True
88-
7 [35000:39999] 35000 ... 1.511762 True
89-
8 [40000:44999] 40000 ... 1.511762 True
90-
9 [45000:49999] 45000 ... 1.511762 True
91-
>>> fig = results.plot(kind='drift', plot_reference=True)
92-
>>> fig.show()
84+
>>> calc.fit(reference)
85+
>>> results = calc.calculate(analysis)
86+
>>> display(results.data)
87+
>>> display(results.calculator.previous_reference_results)
88+
>>> figure = results.plot(plot_reference=True)
89+
>>> figure.show()
9390
"""
9491
super(DataReconstructionDriftCalculator, self).__init__(
9592
chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name

nannyml/drift/model_inputs/univariate/statistical/calculator.py

Lines changed: 35 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -56,31 +56,45 @@ def __init__(
5656
Examples
5757
--------
5858
>>> import nannyml as nml
59-
>>>
60-
>>> reference_df, analysis_df, _ = nml.load_synthetic_binary_classification_dataset()
61-
>>>
62-
>>> feature_column_names = [col for col in reference_df.columns
63-
>>> if col not in ['y_pred', 'y_pred_proba', 'work_home_actual', 'timestamp']]
59+
>>> from IPython.display import display
60+
>>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
61+
>>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
62+
>>> display(reference_df.head())
63+
>>> feature_column_names = [
64+
... col for col in reference_df.columns if col not in [
65+
... 'timestamp', 'y_pred_proba', 'period', 'y_pred', 'work_home_actual', 'identifier'
66+
... ]]
6467
>>> calc = nml.UnivariateStatisticalDriftCalculator(
65-
>>> feature_column_names=feature_column_names,
66-
>>> timestamp_column_name='timestamp'
68+
... feature_column_names=feature_column_names,
69+
... timestamp_column_name='timestamp'
6770
... )
6871
>>> calc.fit(reference_df)
6972
>>> results = calc.calculate(analysis_df)
70-
>>> print(results.data) # check the numbers
71-
key start_index ... identifier_alert identifier_threshold
72-
0 [0:4999] 0 ... True 0.05
73-
1 [5000:9999] 5000 ... True 0.05
74-
2 [10000:14999] 10000 ... True 0.05
75-
3 [15000:19999] 15000 ... True 0.05
76-
4 [20000:24999] 20000 ... True 0.05
77-
5 [25000:29999] 25000 ... True 0.05
78-
6 [30000:34999] 30000 ... True 0.05
79-
7 [35000:39999] 35000 ... True 0.05
80-
8 [40000:44999] 40000 ... True 0.05
81-
9 [45000:49999] 45000 ... True 0.05
82-
>>> fig = results.plot(kind='feature_drift', plot_reference=True, feature_column_name='distance_from_office')
83-
>>> fig.show()
73+
>>> display(results.data.iloc[:, :9])
74+
>>> display(calc.previous_reference_results.iloc[:, :9])
75+
>>> for feature in calc.feature_column_names:
76+
... drift_fig = results.plot(
77+
... kind='feature_drift',
78+
... feature_column_name=feature,
79+
... plot_reference=True
80+
... )
81+
... drift_fig.show()
82+
>>> for cont_feat in calc.continuous_column_names:
83+
... figure = results.plot(
84+
... kind='feature_distribution',
85+
... feature_column_name=cont_feat,
86+
... plot_reference=True
87+
... )
88+
... figure.show()
89+
>>> for cat_feat in calc.categorical_column_names:
90+
... figure = results.plot(
91+
... kind='feature_distribution',
92+
... feature_column_name=cat_feat,
93+
... plot_reference=True)
94+
... figure.show()
95+
>>> ranker = nml.Ranker.by('alert_count')
96+
>>> ranked_features = ranker.rank(results, only_drifting=False)
97+
>>> display(ranked_features)
8498
"""
8599
super(UnivariateStatisticalDriftCalculator, self).__init__(
86100
chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name

nannyml/drift/model_outputs/univariate/statistical/calculator.py

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -61,34 +61,27 @@ def __init__(
6161
Examples
6262
--------
6363
>>> import nannyml as nml
64-
>>>
65-
>>> reference_df, analysis_df, _ = nml.load_synthetic_binary_classification_dataset()
66-
>>>
64+
>>> from IPython.display import display
65+
>>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
66+
>>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
67+
>>> display(reference_df.head())
6768
>>> calc = nml.StatisticalOutputDriftCalculator(
68-
>>> y_pred_proba='y_pred_proba',
69-
>>> y_pred='y_pred',
70-
>>> timestamp_column_name='timestamp'
69+
... y_pred='y_pred',
70+
... y_pred_proba='y_pred_proba',
71+
... timestamp_column_name='timestamp',
72+
... problem_type='classification_binary'
7173
... )
7274
>>> calc.fit(reference_df)
7375
>>> results = calc.calculate(analysis_df)
74-
>>>
75-
>>> print(results.data) # check the numbers
76-
key start_index ... y_pred_proba_alert y_pred_proba_threshold
77-
0 [0:4999] 0 ... True 0.05
78-
1 [5000:9999] 5000 ... False 0.05
79-
2 [10000:14999] 10000 ... False 0.05
80-
3 [15000:19999] 15000 ... False 0.05
81-
4 [20000:24999] 20000 ... False 0.05
82-
5 [25000:29999] 25000 ... True 0.05
83-
6 [30000:34999] 30000 ... True 0.05
84-
7 [35000:39999] 35000 ... True 0.05
85-
8 [40000:44999] 40000 ... True 0.05
86-
9 [45000:49999] 45000 ... True 0.05
87-
>>>
88-
>>> results.plot(kind='score_drift', metric='p_value', plot_reference=True).show()
89-
>>> results.plot(kind='score_distribution', plot_reference=True).show()
90-
>>> results.plot(kind='prediction_drift', plot_reference=True).show()
91-
>>> results.plot(kind='prediction_distribution', plot_reference=True).show()
76+
>>> display(results.data)
77+
>>> score_drift_fig = results.plot(kind='score_drift', plot_reference=True)
78+
>>> score_drift_fig.show()
79+
>>> score_distribution_fig = results.plot(kind='score_distribution', plot_reference=True)
80+
>>> score_distribution_fig.show()
81+
>>> prediction_drift_fig = results.plot(kind='prediction_drift', plot_reference=True)
82+
>>> prediction_drift_fig.show()
83+
>>> prediction_distribution_fig = results.plot(kind='prediction_distribution', plot_reference=True)
84+
>>> prediction_distribution_fig.show()
9285
"""
9386
super(StatisticalOutputDriftCalculator, self).__init__(
9487
chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name

nannyml/drift/target/target_distribution/calculator.py

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -58,30 +58,24 @@ def __init__(
5858
Examples
5959
--------
6060
>>> import nannyml as nml
61-
>>>
62-
>>> reference_df, analysis_df, target_df = nml.load_synthetic_binary_classification_dataset()
63-
>>>
61+
>>> from IPython.display import display
62+
>>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
63+
>>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
64+
>>> analysis_target_df = nml.load_synthetic_binary_classification_dataset()[2]
65+
>>> analysis_df = analysis_df.merge(analysis_target_df, on='identifier')
66+
>>> display(reference_df.head(3))
6467
>>> calc = nml.TargetDistributionCalculator(
65-
>>> y_true='work_home_actual',
66-
>>> timestamp_column_name='timestamp'
68+
... y_true='work_home_actual',
69+
... timestamp_column_name='timestamp',
70+
... problem_type='classification_binary'
6771
... )
6872
>>> calc.fit(reference_df)
69-
>>> results = calc.calculate(analysis_df.merge(target_df, on='identifier'))
70-
>>> print(results.data) # check the numbers
71-
key start_index end_index ... thresholds alert significant
72-
0 [0:4999] 0 4999 ... 0.05 True True
73-
1 [5000:9999] 5000 9999 ... 0.05 False False
74-
2 [10000:14999] 10000 14999 ... 0.05 False False
75-
3 [15000:19999] 15000 19999 ... 0.05 False False
76-
4 [20000:24999] 20000 24999 ... 0.05 False False
77-
5 [25000:29999] 25000 29999 ... 0.05 False False
78-
6 [30000:34999] 30000 34999 ... 0.05 False False
79-
7 [35000:39999] 35000 39999 ... 0.05 False False
80-
8 [40000:44999] 40000 44999 ... 0.05 False False
81-
9 [45000:49999] 45000 49999 ... 0.05 False False
82-
>>>
83-
>>> results.plot(kind='target_drift', plot_reference=True).show()
84-
>>> results.plot(kind='target_distribution', plot_reference=True).show()
73+
>>> results = calc.calculate(analysis_df)
74+
>>> display(results.data.head(3))
75+
>>> target_drift_fig = results.plot(kind='target_drift', plot_reference=True)
76+
>>> target_drift_fig.show()
77+
>>> target_distribution_fig = results.plot(kind='target_distribution', plot_reference=True)
78+
>>> target_distribution_fig.show()
8579
"""
8680
super().__init__(chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name)
8781

nannyml/performance_calculation/calculator.py

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -71,29 +71,27 @@ def __init__(
7171
Examples
7272
--------
7373
>>> import nannyml as nml
74-
>>>
75-
>>> reference_df, analysis_df, target_df = nml.load_synthetic_binary_classification_dataset()
76-
>>>
77-
>>> calc = nml.PerformanceCalculator(y_true='work_home_actual', y_pred='y_pred', y_pred_proba='y_pred_proba',
78-
>>> timestamp_column_name='timestamp', metrics=['f1', 'roc_auc'])
79-
>>>
74+
>>> from IPython.display import display
75+
>>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
76+
>>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
77+
>>> analysis_target_df = nml.load_synthetic_binary_classification_dataset()[2]
78+
>>> analysis_df = analysis_df.merge(analysis_target_df, on='identifier')
79+
>>> display(reference_df.head(3))
80+
>>> calc = nml.PerformanceCalculator(
81+
... y_pred_proba='y_pred_proba',
82+
... y_pred='y_pred',
83+
... y_true='work_home_actual',
84+
... timestamp_column_name='timestamp',
85+
... problem_type='classification_binary',
86+
... metrics=['roc_auc', 'f1', 'precision', 'recall', 'specificity', 'accuracy'],
87+
... chunk_size=5000)
8088
>>> calc.fit(reference_df)
81-
>>>
82-
>>> results = calc.calculate(analysis_df.merge(target_df, on='identifier'))
83-
>>> print(results.data)
84-
key start_index ... roc_auc_upper_threshold roc_auc_alert
85-
0 [0:4999] 0 ... 0.97866 False
86-
1 [5000:9999] 5000 ... 0.97866 False
87-
2 [10000:14999] 10000 ... 0.97866 False
88-
3 [15000:19999] 15000 ... 0.97866 False
89-
4 [20000:24999] 20000 ... 0.97866 False
90-
5 [25000:29999] 25000 ... 0.97866 True
91-
6 [30000:34999] 30000 ... 0.97866 True
92-
7 [35000:39999] 35000 ... 0.97866 True
93-
8 [40000:44999] 40000 ... 0.97866 True
94-
9 [45000:49999] 45000 ... 0.97866 True
89+
>>> results = calc.calculate(analysis_df)
90+
>>> display(results.data)
91+
>>> display(results.calculator.previous_reference_results)
9592
>>> for metric in calc.metrics:
96-
>>> results.plot(metric=metric, plot_reference=True).show()
93+
... figure = results.plot(kind='performance', plot_reference=True, metric=metric)
94+
... figure.show()
9795
"""
9896
super().__init__(chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name)
9997

nannyml/performance_estimation/confidence_based/cbpe.py

Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -91,36 +91,28 @@ def __init__(
9191
Examples
9292
--------
9393
>>> import nannyml as nml
94-
>>>
95-
>>> reference_df, analysis_df, target_df = nml.load_synthetic_binary_classification_dataset()
96-
>>>
94+
>>> from IPython.display import display
95+
>>> reference_df = nml.load_synthetic_binary_classification_dataset()[0]
96+
>>> analysis_df = nml.load_synthetic_binary_classification_dataset()[1]
97+
>>> display(reference_df.head(3))
9798
>>> estimator = nml.CBPE(
98-
>>> y_true='work_home_actual',
99-
>>> y_pred='y_pred',
100-
>>> y_pred_proba='y_pred_proba',
101-
>>> timestamp_column_name='timestamp',
102-
>>> metrics=['f1', 'roc_auc'],
103-
>>> problem_type='classification_binary',
99+
... y_pred_proba='y_pred_proba',
100+
... y_pred='y_pred',
101+
... y_true='work_home_actual',
102+
... timestamp_column_name='timestamp',
103+
... metrics=['roc_auc', 'f1'],
104+
... chunk_size=5000,
105+
... problem_type='classification_binary',
104106
... )
105-
>>>
106107
>>> estimator.fit(reference_df)
107-
>>>
108108
>>> results = estimator.estimate(analysis_df)
109-
>>> print(results.data)
110-
key start_index ... lower_threshold_roc_auc alert_roc_auc
111-
0 [0:4999] 0 ... 0.97866 False
112-
1 [5000:9999] 5000 ... 0.97866 False
113-
2 [10000:14999] 10000 ... 0.97866 False
114-
3 [15000:19999] 15000 ... 0.97866 False
115-
4 [20000:24999] 20000 ... 0.97866 False
116-
5 [25000:29999] 25000 ... 0.97866 True
117-
6 [30000:34999] 30000 ... 0.97866 True
118-
7 [35000:39999] 35000 ... 0.97866 True
119-
8 [40000:44999] 40000 ... 0.97866 True
120-
9 [45000:49999] 45000 ... 0.97866 True
109+
>>> display(results.data)
121110
>>> for metric in estimator.metrics:
122-
>>> results.plot(metric=metric, plot_reference=True).show()
123-
111+
... metric_fig = results.plot(kind='performance', metric=metric)
112+
... metric_fig.show()
113+
>>> for metric in estimator.metrics:
114+
... metric_fig = results.plot(kind='performance', plot_reference=True, metric=metric)
115+
... metric_fig.show()
124116
"""
125117
super().__init__(chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name)
126118

0 commit comments

Comments
 (0)