From be87811e1c68d2305f1e004daa8c79d271a35fb2 Mon Sep 17 00:00:00 2001 From: Nikolaos Perrakis <89025229+nikml@users.noreply.github.com> Date: Tue, 27 Feb 2024 16:14:45 +0200 Subject: [PATCH] Rename Classifer for Drift Detection to Domain Classifer (#368) * fix multiv-why placement * renaming CDD to DC * small fixes * update DC plot title --- .../butterfly-multivariate-drift-cdd.svg | 2 +- .../classifier-for-drift-detection.svg | 2 +- .../How It Works - Multivariate Drift.ipynb | 2 +- ... - Multivariate - Domain Classifier.ipynb} | 207 +++++++++--------- docs/glossary.rst | 6 + docs/how_it_works/multivariate_drift.rst | 31 ++- .../multivariate_drift_detection.rst | 7 +- .../{cdd.rst => dc.rst} | 34 +-- .../multiv_why.rst | 8 - nannyml/__init__.py | 2 +- nannyml/drift/__init__.py | 8 +- .../__init__.py | 2 +- .../calculator.py | 36 +-- .../result.py | 18 +- nannyml/usage_logging.py | 6 +- .../{test_multiv_cdd.py => test_multiv_dc.py} | 12 +- 16 files changed, 195 insertions(+), 188 deletions(-) rename docs/example_notebooks/{Tutorial - Drift - Multivariate - Classifier for Drift.ipynb => Tutorial - Drift - Multivariate - Domain Classifier.ipynb} (71%) rename docs/tutorials/detecting_data_drift/multivariate_drift_detection/{cdd.rst => dc.rst} (89%) delete mode 100644 docs/tutorials/detecting_data_drift/multivariate_drift_detection/multiv_why.rst rename nannyml/drift/multivariate/{classifier_for_drift_detection => domain_classifier}/__init__.py (95%) rename nannyml/drift/multivariate/{classifier_for_drift_detection => domain_classifier}/calculator.py (92%) rename nannyml/drift/multivariate/{classifier_for_drift_detection => domain_classifier}/result.py (87%) rename tests/drift/{test_multiv_cdd.py => test_multiv_dc.py} (65%) diff --git a/docs/_static/how-it-works/butterfly-multivariate-drift-cdd.svg b/docs/_static/how-it-works/butterfly-multivariate-drift-cdd.svg index 0ce17311..1f8c248f 100644 --- a/docs/_static/how-it-works/butterfly-multivariate-drift-cdd.svg +++ 
b/docs/_static/how-it-works/butterfly-multivariate-drift-cdd.svg @@ -1 +1 @@ -Feb 2020Mar 2020Apr 2020May 20200.50.60.70.80.91MetricAlertClassifier for Drift DetectionTimeClassifier AUROCReferenceAnalysis \ No newline at end of file +Feb 2020Mar 2020Apr 2020May 20200.50.60.70.80.91MetricAlertDomain ClassifierTimeClassifier AUROCReferenceAnalysis \ No newline at end of file diff --git a/docs/_static/tutorials/detecting_data_drift/multivariate_drift_detection/classifier-for-drift-detection.svg b/docs/_static/tutorials/detecting_data_drift/multivariate_drift_detection/classifier-for-drift-detection.svg index 6846b03e..837cbef9 100644 --- a/docs/_static/tutorials/detecting_data_drift/multivariate_drift_detection/classifier-for-drift-detection.svg +++ b/docs/_static/tutorials/detecting_data_drift/multivariate_drift_detection/classifier-for-drift-detection.svg @@ -1 +1 @@ -Jan 2018Apr 2018Jul 2018Oct 2018Jan 2019Apr 2019Jul 20190.50.60.70.80.9MetricAlertClassifier for Drift DetectionTimeClassifier AUROCReferenceAnalysis \ No newline at end of file +Jan 2018Apr 2018Jul 2018Oct 2018Jan 2019Apr 2019Jul 20190.50.60.70.80.9MetricAlertMultivariate Drift - Domain ClassifierTimeDomain Classifier AUROCReferenceAnalysis \ No newline at end of file diff --git a/docs/example_notebooks/How It Works - Multivariate Drift.ipynb b/docs/example_notebooks/How It Works - Multivariate Drift.ipynb index 916f8cf5..ea56041a 100644 --- a/docs/example_notebooks/How It Works - Multivariate Drift.ipynb +++ b/docs/example_notebooks/How It Works - Multivariate Drift.ipynb @@ -195,7 +195,7 @@ "outputs": [], "source": [ "# Let's compute multivariate drift\n", - "drift_classifier = nml.DriftDetectionClassifierCalculator(\n", + "drift_classifier = nml.DomainClassifierCalculator(\n", " feature_column_names=feature_column_names,\n", " timestamp_column_name='ordered',\n", " chunk_size=DPP\n", diff --git a/docs/example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb 
b/docs/example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb similarity index 71% rename from docs/example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb rename to docs/example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb index 9a798043..b1979083 100644 --- a/docs/example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb +++ b/docs/example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb @@ -29,6 +29,7 @@ " \n", " \n", " \n", + " id\n", " car_value\n", " salary_range\n", " debt_to_income_ratio\n", @@ -45,6 +46,7 @@ " \n", " \n", " 0\n", + " 0\n", " 39811.0\n", " 40K - 60K €\n", " 0.632950\n", @@ -59,6 +61,7 @@ " \n", " \n", " 1\n", + " 1\n", " 12679.0\n", " 40K - 60K €\n", " 0.718627\n", @@ -73,6 +76,7 @@ " \n", " \n", " 2\n", + " 2\n", " 19847.0\n", " 40K - 60K €\n", " 0.721724\n", @@ -87,6 +91,7 @@ " \n", " \n", " 3\n", + " 3\n", " 22652.0\n", " 20K - 20K €\n", " 0.705992\n", @@ -101,6 +106,7 @@ " \n", " \n", " 4\n", + " 4\n", " 21268.0\n", " 60K+ €\n", " 0.671888\n", @@ -118,12 +124,12 @@ "" ], "text/plain": [ - " car_value salary_range debt_to_income_ratio loan_length \\\n", - "0 39811.0 40K - 60K € 0.632950 19.0 \n", - "1 12679.0 40K - 60K € 0.718627 7.0 \n", - "2 19847.0 40K - 60K € 0.721724 17.0 \n", - "3 22652.0 20K - 20K € 0.705992 16.0 \n", - "4 21268.0 60K+ € 0.671888 21.0 \n", + " id car_value salary_range debt_to_income_ratio loan_length \\\n", + "0 0 39811.0 40K - 60K € 0.632950 19.0 \n", + "1 1 12679.0 40K - 60K € 0.718627 7.0 \n", + "2 2 19847.0 40K - 60K € 0.721724 17.0 \n", + "3 3 22652.0 20K - 20K € 0.705992 16.0 \n", + "4 4 21268.0 60K+ € 0.671888 21.0 \n", "\n", " repaid_loan_on_prev_car size_of_downpayment driver_tenure repaid \\\n", "0 False 40% 0.212653 1.0 \n", @@ -168,19 +174,19 @@ "name": "stdout", "output_type": "stream", "text": [ - 
"+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n", - "| | car_value | salary_range | debt_to_income_ratio | loan_length | repaid_loan_on_prev_car | size_of_downpayment | driver_tenure | repaid | timestamp | y_pred_proba | y_pred |\n", - "+====+=============+================+========================+===============+===========================+=======================+=================+==========+=========================+================+==========+\n", - "| 0 | 39811 | 40K - 60K € | 0.63295 | 19 | False | 40% | 0.212653 | 1 | 2018-01-01 00:00:00.000 | 0.99 | 1 |\n", - "+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n", - "| 1 | 12679 | 40K - 60K € | 0.718627 | 7 | True | 10% | 4.92755 | 0 | 2018-01-01 00:08:43.152 | 0.07 | 0 |\n", - "+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n", - "| 2 | 19847 | 40K - 60K € | 0.721724 | 17 | False | 0% | 0.520817 | 1 | 2018-01-01 00:17:26.304 | 1 | 1 |\n", - "+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n", - "| 3 | 22652 | 20K - 20K € | 0.705992 | 16 | False | 10% | 0.453649 | 1 | 2018-01-01 00:26:09.456 | 0.98 | 1 |\n", - "+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n", - "| 4 | 21268 | 60K+ € | 
0.671888 | 21 | True | 30% | 5.69526 | 1 | 2018-01-01 00:34:52.608 | 0.99 | 1 |\n", - "+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n" + "+----+------+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n", + "| | id | car_value | salary_range | debt_to_income_ratio | loan_length | repaid_loan_on_prev_car | size_of_downpayment | driver_tenure | repaid | timestamp | y_pred_proba | y_pred |\n", + "+====+======+=============+================+========================+===============+===========================+=======================+=================+==========+=========================+================+==========+\n", + "| 0 | 0 | 39811 | 40K - 60K € | 0.63295 | 19 | False | 40% | 0.212653 | 1 | 2018-01-01 00:00:00.000 | 0.99 | 1 |\n", + "+----+------+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n", + "| 1 | 1 | 12679 | 40K - 60K € | 0.718627 | 7 | True | 10% | 4.92755 | 0 | 2018-01-01 00:08:43.152 | 0.07 | 0 |\n", + "+----+------+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n", + "| 2 | 2 | 19847 | 40K - 60K € | 0.721724 | 17 | False | 0% | 0.520817 | 1 | 2018-01-01 00:17:26.304 | 1 | 1 |\n", + "+----+------+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n", + "| 3 | 3 
| 22652 | 20K - 20K € | 0.705992 | 16 | False | 10% | 0.453649 | 1 | 2018-01-01 00:26:09.456 | 0.98 | 1 |\n", + "+----+------+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n", + "| 4 | 4 | 21268 | 60K+ € | 0.671888 | 21 | True | 30% | 5.69526 | 1 | 2018-01-01 00:34:52.608 | 0.99 | 1 |\n", + "+----+------+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+\n" ] } ], @@ -197,15 +203,18 @@ }, "outputs": [], "source": [ - "non_feature_columns = ['timestamp', 'y_pred_proba', 'y_pred', 'repaid']\n", - "\n", "# Define feature columns\n", "feature_column_names = [\n", - " col for col in reference_df.columns\n", - " if col not in non_feature_columns\n", + " 'car_value',\n", + " 'salary_range',\n", + " 'debt_to_income_ratio',\n", + " 'loan_length',\n", + " 'repaid_loan_on_prev_car',\n", + " 'size_of_downpayment',\n", + " 'driver_tenure'\n", "]\n", "\n", - "calc = nml.DriftDetectionClassifierCalculator(\n", + "calc = nml.DomainClassifierCalculator(\n", " feature_column_names=feature_column_names,\n", " timestamp_column_name='timestamp',\n", " chunk_size=5000\n", @@ -244,7 +253,7 @@ " \n", " \n", " chunk\n", - " classifier_auroc\n", + " domain_classifier_auroc\n", " \n", " \n", " \n", @@ -420,18 +429,18 @@ "8 [40000:44999] 8 40000 44999 2019-06-29 22:48:00 \n", "9 [45000:49999] 9 45000 49999 2019-07-30 05:24:00 \n", "\n", - " classifier_auroc \\\n", - " end_date period value upper_threshold \n", - "0 2018-11-30 00:27:16.848 analysis 0.502704 0.65 \n", - "1 2018-12-30 07:03:16.848 analysis 0.496390 0.65 \n", - "2 2019-01-29 13:39:16.848 analysis 0.490815 0.65 \n", - "3 2019-02-28 20:15:16.848 analysis 0.493005 0.65 \n", - "4 2019-03-31 02:51:16.848 analysis 0.503402 
0.65 \n", - "5 2019-04-30 09:27:16.848 analysis 0.913519 0.65 \n", - "6 2019-05-30 16:03:16.848 analysis 0.913364 0.65 \n", - "7 2019-06-29 22:39:16.848 analysis 0.916356 0.65 \n", - "8 2019-07-30 05:15:16.848 analysis 0.913297 0.65 \n", - "9 2019-08-29 11:51:16.848 analysis 0.916694 0.65 \n", + " domain_classifier_auroc \\\n", + " end_date period value upper_threshold \n", + "0 2018-11-30 00:27:16.848 analysis 0.502704 0.65 \n", + "1 2018-12-30 07:03:16.848 analysis 0.496390 0.65 \n", + "2 2019-01-29 13:39:16.848 analysis 0.490815 0.65 \n", + "3 2019-02-28 20:15:16.848 analysis 0.493005 0.65 \n", + "4 2019-03-31 02:51:16.848 analysis 0.503402 0.65 \n", + "5 2019-04-30 09:27:16.848 analysis 0.913519 0.65 \n", + "6 2019-05-30 16:03:16.848 analysis 0.913364 0.65 \n", + "7 2019-06-29 22:39:16.848 analysis 0.916356 0.65 \n", + "8 2019-07-30 05:15:16.848 analysis 0.913297 0.65 \n", + "9 2019-08-29 11:51:16.848 analysis 0.916694 0.65 \n", "\n", " \n", " lower_threshold alert \n", @@ -470,30 +479,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| | | chunk | | | | | | | | classifier_auroc | | | |\n", - "| | | key | | chunk_index | | start_index | | end_index | | start_date | | end_date | | period | | value | | upper_threshold | | lower_threshold | | alert |\n", - "+====+===============+=================+=================+===============+=====================+============================+============+======================+=====================+=====================+===========+\n", - "| 0 | [0:4999] | 0 | 0 | 4999 | 2018-10-30 18:00:00 | 2018-11-30 00:27:16.848000 | analysis | 0.502704 | 0.65 | 0.45 | False |\n", - 
"+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 1 | [5000:9999] | 1 | 5000 | 9999 | 2018-11-30 00:36:00 | 2018-12-30 07:03:16.848000 | analysis | 0.49639 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 2 | [10000:14999] | 2 | 10000 | 14999 | 2018-12-30 07:12:00 | 2019-01-29 13:39:16.848000 | analysis | 0.490815 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 3 | [15000:19999] | 3 | 15000 | 19999 | 2019-01-29 13:48:00 | 2019-02-28 20:15:16.848000 | analysis | 0.493005 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 4 | [20000:24999] | 4 | 20000 | 24999 | 2019-02-28 20:24:00 | 2019-03-31 02:51:16.848000 | analysis | 0.503402 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 5 | [25000:29999] | 5 | 25000 | 29999 | 2019-03-31 03:00:00 | 2019-04-30 09:27:16.848000 | analysis | 0.913519 | 0.65 | 0.45 | True |\n", - 
"+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 6 | [30000:34999] | 6 | 30000 | 34999 | 2019-04-30 09:36:00 | 2019-05-30 16:03:16.848000 | analysis | 0.913364 | 0.65 | 0.45 | True |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 7 | [35000:39999] | 7 | 35000 | 39999 | 2019-05-30 16:12:00 | 2019-06-29 22:39:16.848000 | analysis | 0.916356 | 0.65 | 0.45 | True |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 8 | [40000:44999] | 8 | 40000 | 44999 | 2019-06-29 22:48:00 | 2019-07-30 05:15:16.848000 | analysis | 0.913297 | 0.65 | 0.45 | True |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 9 | [45000:49999] | 9 | 45000 | 49999 | 2019-07-30 05:24:00 | 2019-08-29 11:51:16.848000 | analysis | 0.916694 | 0.65 | 0.45 | True |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n" + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| | | chunk | | | | | | | | domain_classifier_auroc | | | |\n", + "| | 
| key | | chunk_index | | start_index | | end_index | | start_date | | end_date | | period | | value | | upper_threshold | | lower_threshold | | alert |\n", + "+====+===============+=================+=================+===============+=====================+============================+============+=============================+=====================+=====================+===========+\n", + "| 0 | [0:4999] | 0 | 0 | 4999 | 2018-10-30 18:00:00 | 2018-11-30 00:27:16.848000 | analysis | 0.502704 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 1 | [5000:9999] | 1 | 5000 | 9999 | 2018-11-30 00:36:00 | 2018-12-30 07:03:16.848000 | analysis | 0.49639 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 2 | [10000:14999] | 2 | 10000 | 14999 | 2018-12-30 07:12:00 | 2019-01-29 13:39:16.848000 | analysis | 0.490815 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 3 | [15000:19999] | 3 | 15000 | 19999 | 2019-01-29 13:48:00 | 2019-02-28 20:15:16.848000 | analysis | 0.493005 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 4 | [20000:24999] | 4 | 20000 | 24999 | 2019-02-28 20:24:00 | 2019-03-31 02:51:16.848000 | analysis | 0.503402 | 
0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 5 | [25000:29999] | 5 | 25000 | 29999 | 2019-03-31 03:00:00 | 2019-04-30 09:27:16.848000 | analysis | 0.913519 | 0.65 | 0.45 | True |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 6 | [30000:34999] | 6 | 30000 | 34999 | 2019-04-30 09:36:00 | 2019-05-30 16:03:16.848000 | analysis | 0.913364 | 0.65 | 0.45 | True |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 7 | [35000:39999] | 7 | 35000 | 39999 | 2019-05-30 16:12:00 | 2019-06-29 22:39:16.848000 | analysis | 0.916356 | 0.65 | 0.45 | True |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 8 | [40000:44999] | 8 | 40000 | 44999 | 2019-06-29 22:48:00 | 2019-07-30 05:15:16.848000 | analysis | 0.913297 | 0.65 | 0.45 | True |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 9 | [45000:49999] | 9 | 45000 | 49999 | 2019-07-30 05:24:00 | 2019-08-29 11:51:16.848000 | analysis | 0.916694 | 0.65 | 0.45 | True |\n", + 
"+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n" ] } ], @@ -535,7 +544,7 @@ " \n", " \n", " chunk\n", - " classifier_auroc\n", + " domain_classifier_auroc\n", " \n", " \n", " \n", @@ -711,18 +720,18 @@ "8 [40000:44999] 8 40000 44999 2018-08-31 04:48:00 \n", "9 [45000:49999] 9 45000 49999 2018-09-30 11:24:00 \n", "\n", - " classifier_auroc \\\n", - " end_date period value upper_threshold \n", - "0 2018-01-31 06:27:16.848 reference 0.508085 0.65 \n", - "1 2018-03-02 13:03:16.848 reference 0.505428 0.65 \n", - "2 2018-04-01 19:39:16.848 reference 0.506587 0.65 \n", - "3 2018-05-02 02:15:16.848 reference 0.499824 0.65 \n", - "4 2018-06-01 08:51:16.848 reference 0.507135 0.65 \n", - "5 2018-07-01 15:27:16.848 reference 0.498486 0.65 \n", - "6 2018-07-31 22:03:16.848 reference 0.501805 0.65 \n", - "7 2018-08-31 04:39:16.848 reference 0.494281 0.65 \n", - "8 2018-09-30 11:15:16.848 reference 0.505302 0.65 \n", - "9 2018-10-30 17:51:16.848 reference 0.502734 0.65 \n", + " domain_classifier_auroc \\\n", + " end_date period value upper_threshold \n", + "0 2018-01-31 06:27:16.848 reference 0.508085 0.65 \n", + "1 2018-03-02 13:03:16.848 reference 0.505428 0.65 \n", + "2 2018-04-01 19:39:16.848 reference 0.506587 0.65 \n", + "3 2018-05-02 02:15:16.848 reference 0.499824 0.65 \n", + "4 2018-06-01 08:51:16.848 reference 0.507135 0.65 \n", + "5 2018-07-01 15:27:16.848 reference 0.498486 0.65 \n", + "6 2018-07-31 22:03:16.848 reference 0.501805 0.65 \n", + "7 2018-08-31 04:39:16.848 reference 0.494281 0.65 \n", + "8 2018-09-30 11:15:16.848 reference 0.505302 0.65 \n", + "9 2018-10-30 17:51:16.848 reference 0.502734 0.65 \n", "\n", " \n", " lower_threshold alert \n", @@ -761,30 +770,30 @@ "name": "stdout", "output_type": "stream", "text": [ - 
"+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| | | chunk | | | | | | | | classifier_auroc | | | |\n", - "| | | key | | chunk_index | | start_index | | end_index | | start_date | | end_date | | period | | value | | upper_threshold | | lower_threshold | | alert |\n", - "+====+===============+=================+=================+===============+=====================+============================+============+======================+=====================+=====================+===========+\n", - "| 0 | [0:4999] | 0 | 0 | 4999 | 2018-01-01 00:00:00 | 2018-01-31 06:27:16.848000 | reference | 0.508085 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 1 | [5000:9999] | 1 | 5000 | 9999 | 2018-01-31 06:36:00 | 2018-03-02 13:03:16.848000 | reference | 0.505428 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 2 | [10000:14999] | 2 | 10000 | 14999 | 2018-03-02 13:12:00 | 2018-04-01 19:39:16.848000 | reference | 0.506587 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 3 | [15000:19999] | 3 | 15000 | 19999 | 2018-04-01 19:48:00 | 2018-05-02 02:15:16.848000 | reference | 0.499824 | 0.65 | 0.45 | False |\n", - 
"+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 4 | [20000:24999] | 4 | 20000 | 24999 | 2018-05-02 02:24:00 | 2018-06-01 08:51:16.848000 | reference | 0.507135 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 5 | [25000:29999] | 5 | 25000 | 29999 | 2018-06-01 09:00:00 | 2018-07-01 15:27:16.848000 | reference | 0.498486 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 6 | [30000:34999] | 6 | 30000 | 34999 | 2018-07-01 15:36:00 | 2018-07-31 22:03:16.848000 | reference | 0.501805 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 7 | [35000:39999] | 7 | 35000 | 39999 | 2018-07-31 22:12:00 | 2018-08-31 04:39:16.848000 | reference | 0.494281 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 8 | [40000:44999] | 8 | 40000 | 44999 | 2018-08-31 04:48:00 | 2018-09-30 11:15:16.848000 | reference | 0.505302 | 0.65 | 0.45 | False |\n", - 
"+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n", - "| 9 | [45000:49999] | 9 | 45000 | 49999 | 2018-09-30 11:24:00 | 2018-10-30 17:51:16.848000 | reference | 0.502734 | 0.65 | 0.45 | False |\n", - "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+----------------------+---------------------+---------------------+-----------+\n" + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| | | chunk | | | | | | | | domain_classifier_auroc | | | |\n", + "| | | key | | chunk_index | | start_index | | end_index | | start_date | | end_date | | period | | value | | upper_threshold | | lower_threshold | | alert |\n", + "+====+===============+=================+=================+===============+=====================+============================+============+=============================+=====================+=====================+===========+\n", + "| 0 | [0:4999] | 0 | 0 | 4999 | 2018-01-01 00:00:00 | 2018-01-31 06:27:16.848000 | reference | 0.508085 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 1 | [5000:9999] | 1 | 5000 | 9999 | 2018-01-31 06:36:00 | 2018-03-02 13:03:16.848000 | reference | 0.505428 | 0.65 | 0.45 | False |\n", + 
"+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 2 | [10000:14999] | 2 | 10000 | 14999 | 2018-03-02 13:12:00 | 2018-04-01 19:39:16.848000 | reference | 0.506587 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 3 | [15000:19999] | 3 | 15000 | 19999 | 2018-04-01 19:48:00 | 2018-05-02 02:15:16.848000 | reference | 0.499824 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 4 | [20000:24999] | 4 | 20000 | 24999 | 2018-05-02 02:24:00 | 2018-06-01 08:51:16.848000 | reference | 0.507135 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 5 | [25000:29999] | 5 | 25000 | 29999 | 2018-06-01 09:00:00 | 2018-07-01 15:27:16.848000 | reference | 0.498486 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 6 | [30000:34999] | 6 | 30000 | 34999 | 2018-07-01 15:36:00 | 2018-07-31 22:03:16.848000 | reference | 0.501805 | 0.65 | 0.45 | False |\n", + 
"+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 7 | [35000:39999] | 7 | 35000 | 39999 | 2018-07-31 22:12:00 | 2018-08-31 04:39:16.848000 | reference | 0.494281 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 8 | [40000:44999] | 8 | 40000 | 44999 | 2018-08-31 04:48:00 | 2018-09-30 11:15:16.848000 | reference | 0.505302 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n", + "| 9 | [45000:49999] | 9 | 45000 | 49999 | 2018-09-30 11:24:00 | 2018-10-30 17:51:16.848000 | reference | 0.502734 | 0.65 | 0.45 | False |\n", + "+----+---------------+-----------------+-----------------+---------------+---------------------+----------------------------+------------+-----------------------------+---------------------+---------------------+-----------+\n" ] } ], @@ -834,7 +843,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/docs/glossary.rst b/docs/glossary.rst index b03be8f7..6f664acf 100644 --- a/docs/glossary.rst +++ b/docs/glossary.rst @@ -110,6 +110,12 @@ Glossary You can read more about Data Periods in the :ref:`relevant data requirements section`. + Domain Classifier + A domain classifer is a machine learning classification model trained to identify whether a given data point + belongs to one or another dataset. 
NannyML uses domain classifiers as a multivariate drift detection method. + You can read more about them in :ref:`How it works: Domain Classifier` and see how to use + them in :ref:`Tutorial: Domain Classifier`. + Error The error of a statistic on a sample is defined as the difference between the value of the observation and the true value. The sample size can sometimes be 1 but it is usually bigger. When the error consists only of the effects diff --git a/docs/how_it_works/multivariate_drift.rst b/docs/how_it_works/multivariate_drift.rst index 93355b1f..9e106721 100644 --- a/docs/how_it_works/multivariate_drift.rst +++ b/docs/how_it_works/multivariate_drift.rst @@ -149,18 +149,17 @@ For more information on using Reconstruction Error with PCA check the :ref:`Multivariate Drift - Data Reconstruction with PCA` tutorial. -.. _how-multiv-drift-cdd: +.. _how-multiv-drift-dc: -Classifier for Drift Detection ------------------------------- +Domain Classifier +----------------- -Classifier for drift detection provides a measure of how easy it is to discriminate -the reference data from the examined chunk data. It is an implementation of domain classifiers, as -they are called in `relevant literature`_, using a LightGBM classifier. +A :term:`Domain Classifier` allows us to create a measure of how easy it is to discriminate +the reference data from the examined chunk data. NannyML uses a LightGBM classifier. As a measure of discrimination performance NannyML uses the cross-validated AUROC score. Similar to data reconstruction with PCA this method is also able to capture complex changes in our data. -The algorithm implementing Classifier for Drift Detection follows the steps described below. +The algorithm implementing Domain Classifier follows the steps described below. Please note that the process described below is repeated for each :term:`Data Chunk`. The process consists of two basic parts, data preprocessing and classifier cross validation. 
@@ -187,10 +186,10 @@ The higher the AUROC score the easier it is to distinguish the datasets, hence t more different they are. -Understanding Classifier for Drift Detection -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Understanding Domain Classifier +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The Classifier for Drift Detection method relines on a machine learning +The Domain Classifier method relies on a machine learning algorithm to distinguish between the reference and the chunk data. We are using a LightGBM Classifier. Because of the versatility of this approach the classifier is quite sensitive to shifts in the data. @@ -199,10 +198,10 @@ directly translate classifier AUROC values to possible performance impact. It is better to rely on :ref:`performance estimation` methods for that. -Classifier for Drift Detection on the butterfly dataset -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Domain Classifier on the butterfly dataset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Now that we have a better understanding of Classifier for Drift Detection, let's see +Now that we have a better understanding of Domain Classifier, let's see how it performs on the butterfly dataset. .. nbimport:: @@ -214,8 +213,6 @@ how it performs on the butterfly dataset. The change in the butterfly dataset is now clearly visible through the change in the classifier's AUROC, while our earlier univariate approach detected no change. -For more information on using Classifier for Drift Detection check -the :ref:`Multivariate Drift - Classifier for Drift Detection` +For more information on using Domain Classifier check +the :ref:`Multivariate Drift - Domain Classifier` tutorial. - -.. 
_`relevant literature`: https://arxiv.org/abs/1810.11953 diff --git a/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst b/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst index 0eb3995d..f388565c 100644 --- a/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst +++ b/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst @@ -4,10 +4,13 @@ Multivariate Drift Detection ============================ +Multivariate data drift detection complements :ref:`univariate data drift detection methods`. +It provides one summary number reducing the risk of false alerts, and detects more subtle changes +in the data structure that cannot be detected with univariate approaches. The trade off is that +multivariate drift results are less explainable compared to univariate drift results. .. toctree:: :maxdepth: 2 - multivariate_drift_detection/multiv_why multivariate_drift_detection/pca - multivariate_drift_detection/cdd + multivariate_drift_detection/dc diff --git a/docs/tutorials/detecting_data_drift/multivariate_drift_detection/cdd.rst b/docs/tutorials/detecting_data_drift/multivariate_drift_detection/dc.rst similarity index 89% rename from docs/tutorials/detecting_data_drift/multivariate_drift_detection/cdd.rst rename to docs/tutorials/detecting_data_drift/multivariate_drift_detection/dc.rst index d902473f..e59afdcf 100644 --- a/docs/tutorials/detecting_data_drift/multivariate_drift_detection/cdd.rst +++ b/docs/tutorials/detecting_data_drift/multivariate_drift_detection/dc.rst @@ -1,12 +1,12 @@ -.. _multivariate_drift_detection_cdd: +.. _multivariate_drift_detection_dc: -============================== -Classifier for Drift Detection -============================== +================= +Domain Classifier +================= -The second multivariate drift detection method of NannyML is Classifier for Drift Detection. +The second multivariate drift detection method of NannyML is Domain Classifier.
It provides a measure of how easy it is to discriminate the reference data from the examined chunk data. -You can read more about on the :ref:`How it works: Classifier for Drift Detection` section. +You can read more about on the :ref:`How it works: Domain Classifier` section. When there is no data drift the datasets can't discerned and we get a value of 0.5. The more drift there is, the higher the returned measure will be, up to a value of 1. @@ -14,7 +14,7 @@ Just The Code ------------- .. nbimport:: - :path: ./example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb + :path: ./example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb :cells: 1 3 4 6 8 .. admonition:: **Advanced configuration** @@ -43,14 +43,14 @@ Let's start by loading some synthetic data provided by the NannyML package set i This synthetic data is for a binary classification model, but multi-class classification can be handled in the same way. .. nbimport:: - :path: ./example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb + :path: ./example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb :cells: 1 .. nbtable:: - :path: ./example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb + :path: ./example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb :cell: 2 -The :class:`~nannyml.drift.multivariate.classifier_for_drift_detection.calculator.DriftDetectionClassifierCalculator` +The :class:`~nannyml.drift.multivariate.domain_classifier.calculator.DomainClassifierCalculator` module implements this functionality. We need to instantiate it with appropriate parameters: - **feature_column_names:** A list with the column names of the features we want to run drift detection on. @@ -67,7 +67,7 @@ module implements this functionality. We need to instantiate it with appropriate order to create chunks. 
- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation provided data in order to create chunks. -- **cv_folds_num (Optional):** Number of cross-validation folds to use when calculating CDD discrimination value. +- **cv_folds_num (Optional):** Number of cross-validation folds to use when calculating DC discrimination value. - **hyperparameters (Optional):** A dictionary used to provide your own custom hyperparameters when training the discrimination model. Check out the available hyperparameter options in the `LightGBM docs`_. - **tune_hyperparameters (Optional):** A boolean controlling whether hypertuning should be performed on the internal @@ -84,7 +84,7 @@ which the results will be based on. Then the calculate the multivariate drift results on the provided data. .. nbimport:: - :path: ./example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb + :path: ./example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb :cells: 3 We can see these results of the data provided to the @@ -92,21 +92,21 @@ We can see these results of the data provided to the method as a dataframe. .. nbimport:: - :path: ./example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb + :path: ./example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb :cells: 4 .. nbtable:: - :path: ./example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb + :path: ./example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb :cell: 5 The drift results from the reference data are accessible from the properties of the results object: .. nbimport:: - :path: ./example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb + :path: ./example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb :cells: 6 .. 
nbtable:: - :path: ./example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb + :path: ./example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb :cell: 7 @@ -119,7 +119,7 @@ NannyML can also visualize the multivariate drift results in a plot. Our plot co A red, diamond-shaped point marker additionally indicates this in the middle of the chunk. .. nbimport:: - :path: ./example_notebooks/Tutorial - Drift - Multivariate - Classifier for Drift.ipynb + :path: ./example_notebooks/Tutorial - Drift - Multivariate - Domain Classifier.ipynb :cells: 8 .. image:: /_static/tutorials/detecting_data_drift/multivariate_drift_detection/classifier-for-drift-detection.svg diff --git a/docs/tutorials/detecting_data_drift/multivariate_drift_detection/multiv_why.rst b/docs/tutorials/detecting_data_drift/multivariate_drift_detection/multiv_why.rst deleted file mode 100644 index 821ea8d8..00000000 --- a/docs/tutorials/detecting_data_drift/multivariate_drift_detection/multiv_why.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _multivariate_drift_detection_why: - -Why Perform Multivariate Drift Detection ----------------------------------------- - -Multivariate data drift detection addresses the shortcomings of :ref:`univariate data detection methods`. -It provides one summary number reducing the risk of false alerts, and detects more subtle changes -in the data structure that cannot be detected with univariate approaches. 
diff --git a/nannyml/__init__.py b/nannyml/__init__.py index 6b8fbb82..30e8311f 100644 --- a/nannyml/__init__.py +++ b/nannyml/__init__.py @@ -55,7 +55,7 @@ AlertCountRanker, CorrelationRanker, DataReconstructionDriftCalculator, - DriftDetectionClassifierCalculator, + DomainClassifierCalculator, UnivariateDriftCalculator, ) from .exceptions import ChunkerException, InvalidArgumentsException, MissingMetadataException diff --git a/nannyml/drift/__init__.py b/nannyml/drift/__init__.py index 0f97271d..f53ee9c5 100644 --- a/nannyml/drift/__init__.py +++ b/nannyml/drift/__init__.py @@ -19,11 +19,11 @@ The multivariate drift detection methods include: - Data reconstruction error: detects drift by performing dimensionality reduction on the model - inputs and then applying the inverse transformation on the latent (reduced) space. - - + inputs using PCA and then applying the inverse transformation on the latent (reduced) space. +- Domain Classifier: detects drift by looking at how performant a domain classifier is at distinguishing + between the reference and the chunk datasets.
""" -from .multivariate.classifier_for_drift_detection import DriftDetectionClassifierCalculator +from .multivariate.domain_classifier import DomainClassifierCalculator from .multivariate.data_reconstruction import DataReconstructionDriftCalculator from .ranker import AlertCountRanker, CorrelationRanker from .univariate import FeatureType, Method, MethodFactory, UnivariateDriftCalculator diff --git a/nannyml/drift/multivariate/classifier_for_drift_detection/__init__.py b/nannyml/drift/multivariate/domain_classifier/__init__.py similarity index 95% rename from nannyml/drift/multivariate/classifier_for_drift_detection/__init__.py rename to nannyml/drift/multivariate/domain_classifier/__init__.py index 3589a394..c646ac23 100644 --- a/nannyml/drift/multivariate/classifier_for_drift_detection/__init__.py +++ b/nannyml/drift/multivariate/domain_classifier/__init__.py @@ -25,5 +25,5 @@ """ -from .calculator import DriftDetectionClassifierCalculator +from .calculator import DomainClassifierCalculator from .result import Result diff --git a/nannyml/drift/multivariate/classifier_for_drift_detection/calculator.py b/nannyml/drift/multivariate/domain_classifier/calculator.py similarity index 92% rename from nannyml/drift/multivariate/classifier_for_drift_detection/calculator.py rename to nannyml/drift/multivariate/domain_classifier/calculator.py index 08ab9391..5228c0d1 100644 --- a/nannyml/drift/multivariate/classifier_for_drift_detection/calculator.py +++ b/nannyml/drift/multivariate/domain_classifier/calculator.py @@ -28,7 +28,7 @@ from nannyml.base import AbstractCalculator, _list_missing, _split_features_by_type from nannyml.chunk import Chunker -from nannyml.drift.multivariate.classifier_for_drift_detection.result import Result +from nannyml.drift.multivariate.domain_classifier.result import Result from nannyml.exceptions import InvalidArgumentsException # from nannyml.sampling_error import SAMPLING_ERROR_RANGE @@ -71,8 +71,8 @@ } -class 
DriftDetectionClassifierCalculator(AbstractCalculator): - """DriftDetectionClassifierCalculator implementation. +class DomainClassifierCalculator(AbstractCalculator): + """DomainClassifierCalculator implementation. Uses Drift Detection Classifier's cross validated performance as a measure of drift. """ @@ -92,7 +92,7 @@ def __init__( hyperparameter_tuning_config: Optional[Dict[str, Any]] = DEFAULT_LGBM_HYPERPARAM_TUNING_CONFIG, threshold: Threshold = ConstantThreshold(lower=0.45, upper=0.65), ): - """Create a new DriftDetectionClassifierCalculator instance. + """Create a new DomainClassifierCalculator instance. Parameters: ----------- @@ -116,7 +116,7 @@ def __init__( chunker : Chunker, default=None The `Chunker` used to split the data sets into a lists of chunks. cv_folds_num: Optional[int] - Number of cross-validation folds to use when calculating CDD discrimination value. + Number of cross-validation folds to use when calculating DC discrimination value. hyperparameters : Dict[str, Any], default = None A dictionary used to provide your own custom hyperparameters when training the discrimination model. Check out the available hyperparameter options in the @@ -159,7 +159,7 @@ def __init__( ... col for col in reference_df.columns ... if col not in non_feature_columns >>> ] - >>> calc = nml.DriftDetectionClassifierCalculator( + >>> calc = nml.DomainClassifierCalculator( ... feature_column_names=feature_column_names, ... timestamp_column_name='timestamp', ... 
chunk_size=5000 @@ -169,7 +169,7 @@ def __init__( >>> figure = results.plot() >>> figure.show() """ - super(DriftDetectionClassifierCalculator, self).__init__( + super(DomainClassifierCalculator, self).__init__( chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name ) if isinstance(feature_column_names, str): @@ -201,9 +201,9 @@ def __init__( # self._sampling_error_components: Tuple = () self.result: Optional[Result] = None - @log_usage(UsageEvent.CDD_CALC_FIT) + @log_usage(UsageEvent.DC_CALC_FIT) def _fit(self, reference_data: pd.DataFrame, *args, **kwargs): - """Fits the CDD calculator to a set of reference data.""" + """Fits the DC calculator to a set of reference data.""" if reference_data.empty: raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.') @@ -232,9 +232,9 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs): return self - @log_usage(UsageEvent.CDD_CALC_RUN) + @log_usage(UsageEvent.DC_CALC_RUN) def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result: - """Calculate the data CDD calculator metric for a given data set.""" + """Calculate the data DC calculator metric for a given data set.""" if data.empty: raise InvalidArgumentsException('data contains no rows. 
Please provide a valid data set.') @@ -330,20 +330,20 @@ def _calculate_chunk(self, data: pd.DataFrame): def _set_metric_thresholds(self, result_data: pd.DataFrame): self.lower_threshold_value, self.upper_threshold_value = calculate_threshold_values( threshold=self.threshold, - data=result_data.loc[:, ('classifier_auroc', 'value')], + data=result_data.loc[:, ('domain_classifier_auroc', 'value')], lower_threshold_value_limit=self._lower_threshold_value_limit, upper_threshold_value_limit=self._upper_threshold_value_limit, logger=self._logger, ) def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame: - result_data[('classifier_auroc', 'upper_threshold')] = self.upper_threshold_value - result_data[('classifier_auroc', 'lower_threshold')] = self.lower_threshold_value - result_data[('classifier_auroc', 'alert')] = result_data.apply( + result_data[('domain_classifier_auroc', 'upper_threshold')] = self.upper_threshold_value + result_data[('domain_classifier_auroc', 'lower_threshold')] = self.lower_threshold_value + result_data[('domain_classifier_auroc', 'alert')] = result_data.apply( lambda row: True if ( - row[('classifier_auroc', 'value')] > row[('classifier_auroc', 'upper_threshold')] - or row[('classifier_auroc', 'value')] < row[('classifier_auroc', 'lower_threshold')] + row[('domain_classifier_auroc', 'value')] > row[('domain_classifier_auroc', 'upper_threshold')] + or row[('domain_classifier_auroc', 'value')] < row[('domain_classifier_auroc', 'lower_threshold')] ) else False, axis=1, @@ -401,7 +401,7 @@ def _create_multilevel_index(include_thresholds: bool = False): 'alert', ] chunk_tuples = [('chunk', chunk_column_name) for chunk_column_name in chunk_column_names] - reconstruction_tuples = [('classifier_auroc', column_name) for column_name in results_column_names] + reconstruction_tuples = [('domain_classifier_auroc', column_name) for column_name in results_column_names] tuples = chunk_tuples + reconstruction_tuples diff --git 
a/nannyml/drift/multivariate/classifier_for_drift_detection/result.py b/nannyml/drift/multivariate/domain_classifier/result.py similarity index 87% rename from nannyml/drift/multivariate/classifier_for_drift_detection/result.py rename to nannyml/drift/multivariate/domain_classifier/result.py index c58bc0b8..339bb62a 100644 --- a/nannyml/drift/multivariate/classifier_for_drift_detection/result.py +++ b/nannyml/drift/multivariate/domain_classifier/result.py @@ -34,12 +34,12 @@ def __init__( continuous_column_names: List[str], timestamp_column_name: Optional[str] = None, ): - """Initialize a DriftDetectionClassifierCalculator results object. + """Initialize a DomainClassifierCalculator results object. Parameters ---------- results_data: pd.DataFrame - Results data returned by a DriftDetectionClassifierCalculator. + Results data returned by a DomainClassifierCalculator. column_names: List[str] A list of column names indicating which columns contain feature values. categorical_column_names : List[str] @@ -50,7 +50,7 @@ def __init__( The name of the column containing the timestamp of the model prediction. If not given, plots will not use a time-based x-axis but will use the index of the chunks instead. 
""" - metric = Metric(display_name='Classifier for Drift Detection', column_name='classifier_auroc') + metric = Metric(display_name='Domain Classifier', column_name='domain_classifier_auroc') super().__init__(results_data, [metric]) self.column_names = column_names @@ -60,9 +60,9 @@ def __init__( def keys(self) -> List[Key]: """Create a list of keys where each Key is a `namedtuple('Key', 'properties display_names')`.""" - return [Key(properties=('classifier_auroc',), display_names=('Classifier AUROC ',))] + return [Key(properties=('domain_classifier_auroc',), display_names=('Classifier AUROC ',))] - @log_usage(UsageEvent.CDD_RESULTS_PLOT, metadata_from_kwargs=['kind']) + @log_usage(UsageEvent.DC_RESULTS_PLOT, metadata_from_kwargs=['kind']) def plot(self, kind: str = 'drift', *args, **kwargs) -> go.Figure: """Render plots for metrics returned by the multivariate classifier for drift detection. @@ -93,7 +93,7 @@ def plot(self, kind: str = 'drift', *args, **kwargs) -> go.Figure: ... col for col in reference_df.columns ... if col not in non_feature_columns >>> ] - >>> calc = nml.DriftDetectionClassifierCalculator( + >>> calc = nml.DomainClassifierCalculator( ... feature_column_names=feature_column_names, ... timestamp_column_name='timestamp', ... chunk_size=5000 @@ -106,9 +106,9 @@ def plot(self, kind: str = 'drift', *args, **kwargs) -> go.Figure: if kind == 'drift': return plot_metric( self, - title='Classifier for Drift Detection', - metric_display_name='Classifier AUROC ', - metric_column_name='classifier_auroc', + title='Multivariate Drift - Domain Classifier', + metric_display_name='Domain Classifier AUROC ', + metric_column_name='domain_classifier_auroc', hover=Hover( template='%{period}     %{alert}
' 'Chunk: %{chunk_key}     %{x_coordinate}
' diff --git a/nannyml/usage_logging.py b/nannyml/usage_logging.py index e8798c6f..54436921 100644 --- a/nannyml/usage_logging.py +++ b/nannyml/usage_logging.py @@ -78,9 +78,9 @@ class UsageEvent(str, Enum): MULTIVAR_DRIFT_CALC_RUN = "Multivariate reconstruction error drift calculator run" MULTIVAR_DRIFT_PLOT = "Multivariate drift results plot" - CDD_CALC_FIT = "Classifier for Drift Detection calculator fit" - CDD_CALC_RUN = "Classifier for Drift Detection calculator run" - CDD_RESULTS_PLOT = "Classifier for Drift Detection results plot" + DC_CALC_FIT = "Domain Classifier calculator fit" + DC_CALC_RUN = "Domain Classifier calculator run" + DC_RESULTS_PLOT = "Domain Classifier results plot" PERFORMANCE_CALC_FIT = "Realized performance calculator fit" PERFORMANCE_CALC_RUN = "Realized performance calculator run" diff --git a/tests/drift/test_multiv_cdd.py b/tests/drift/test_multiv_dc.py similarity index 65% rename from tests/drift/test_multiv_cdd.py rename to tests/drift/test_multiv_dc.py index e234e9d1..42bb0e57 100644 --- a/tests/drift/test_multiv_cdd.py +++ b/tests/drift/test_multiv_dc.py @@ -3,7 +3,7 @@ # # License: Apache Software License 2.0 -"""Tests for Multivariate Classifier for Drift Detection package.""" +"""Tests for Multivariate Domain Classifier package.""" from typing import Tuple @@ -13,7 +13,7 @@ from nannyml.datasets import load_synthetic_car_loan_dataset # from nannyml._typing import Result -from nannyml.drift.multivariate.classifier_for_drift_detection.calculator import DriftDetectionClassifierCalculator +from nannyml.drift.multivariate.domain_classifier.calculator import DomainClassifierCalculator column_names1 = [ 'salary_range', @@ -33,19 +33,19 @@ def binary_classification_data() -> Tuple[pd.DataFrame, pd.DataFrame]: # noqa: def test_default_cdd_run(binary_classification_data): - """Test a default run of CDD.""" + """Test a default run of DC.""" ( reference, analysis, ) = binary_classification_data - calc = 
DriftDetectionClassifierCalculator(feature_column_names=column_names1, chunk_size=5_000) + calc = DomainClassifierCalculator(feature_column_names=column_names1, chunk_size=5_000) calc.fit(reference) results = calc.calculate(analysis) - assert list(results.to_df().loc[:, ("classifier_auroc", "value")].round(4)) == [ + assert list(results.to_df().loc[:, ("domain_classifier_auroc", "value")].round(4)) == [ 0.5020, 0.5002, 0.5174, 0.9108, 0.9136, ] - assert list(results.to_df().loc[:, ("classifier_auroc", "alert")]) == [False, False, False, True, True] + assert list(results.to_df().loc[:, ("domain_classifier_auroc", "alert")]) == [False, False, False, True, True]