[skip ci] Merge remote-tracking branch 'origin/main' into main

NannyML · Sep 16, 2022 · 9549034 · 9549034
2 parents 181f75b + 2ef024e
commit 9549034
Show file tree

Hide file tree

Showing 66 changed files with 4,167 additions and 360 deletions.
diff --git a/docs/_static/quick-start-drift-distance_from_office.svg b/docs/_static/quick-start-drift-distance_from_office.svg
diff --git a/docs/_static/quick-start-drift-gas_price_per_litre.svg b/docs/_static/quick-start-drift-gas_price_per_litre.svg
diff --git a/docs/_static/quick-start-drift-multivariate.svg b/docs/_static/quick-start-drift-multivariate.svg
diff --git a/docs/_static/quick-start-drift-public_transportation_cost.svg b/docs/_static/quick-start-drift-public_transportation_cost.svg
diff --git a/docs/_static/quick-start-drift-salary_range.svg b/docs/_static/quick-start-drift-salary_range.svg
diff --git a/docs/_static/quick-start-drift-tenure.svg b/docs/_static/quick-start-drift-tenure.svg
diff --git a/docs/_static/quick-start-drift-wfh_prev_workday.svg b/docs/_static/quick-start-drift-wfh_prev_workday.svg
diff --git a/docs/_static/quick-start-drift-workday.svg b/docs/_static/quick-start-drift-workday.svg
diff --git a/docs/_static/quick-start-perf-est.svg b/docs/_static/quick-start-perf-est.svg
diff --git a/docs/_static/quick-start-score-drift.svg b/docs/_static/quick-start-score-drift.svg
diff --git a/docs/example_notebooks/Quickstart.ipynb b/docs/example_notebooks/Quickstart.ipynb
@@ -391,7 +391,6 @@
     "    y_pred_proba='y_pred_proba',\n",
     "    y_pred='y_pred',\n",
     "    y_true='work_home_actual',\n",
-    "    timestamp_column_name='timestamp',\n",
     "    metrics=['roc_auc'],\n",
     "    chunk_size=chunk_size,\n",
     "    problem_type='classification_binary',\n",
@@ -427,7 +426,6 @@
     "# Let's initialize the object that will perform the Univariate Drift calculations\n",
     "univariate_calculator = nml.UnivariateStatisticalDriftCalculator(\n",
     "    feature_column_names=feature_column_names,\n",
-    "    timestamp_column_name='timestamp',\n",
     "    chunk_size=chunk_size\n",
     ")\n",
     "univariate_calculator = univariate_calculator.fit(reference)\n",
@@ -600,7 +598,6 @@
     "calc = nml.StatisticalOutputDriftCalculator(\n",
     "    y_pred='y_pred',\n",
     "    y_pred_proba='y_pred_proba',\n",
-    "    timestamp_column_name='timestamp',\n",
     "    problem_type='classification_binary'\n",
     ")\n",
     "calc.fit(reference)\n",
@@ -626,7 +623,10 @@
    "outputs": [],
    "source": [
     "# Let's initialize the object that will perform Data Reconstruction with PCA\n",
-    "rcerror_calculator = nml.DataReconstructionDriftCalculator(feature_column_names=feature_column_names, timestamp_column_name='timestamp', chunk_size=chunk_size).fit(reference_data=reference)\n",
+    "rcerror_calculator = nml.DataReconstructionDriftCalculator(\n",
+    "    feature_column_names=feature_column_names,\n",
+    "    chunk_size=chunk_size\n",
+    ").fit(reference_data=reference)\n",
     "# let's see Reconstruction error statistics for all available data\n",
     "rcerror_results = rcerror_calculator.calculate(analysis)\n",
     "figure = rcerror_results.plot(kind='drift', plot_reference=True)\n",

diff --git a/docs/quick.rst b/docs/quick.rst
@@ -42,6 +42,12 @@ concepts and functionalities. If you want to know what is implemented under the
 visit :ref:`how it works<how_it_works>`. Finally, if you just look for examples
 on other datasets or ML problems look through our :ref:`examples<examples>`.
 
+.. note::
+    The following example does not use any :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 
 -------------
 Just the code

diff --git a/docs/tutorials/data_requirements.rst b/docs/tutorials/data_requirements.rst
@@ -109,6 +109,8 @@ Below we see the columns our dataset contains and explain their purpose.
 +----+------------------------+----------------+-----------------------+------------------------------+--------------------+-----------+----------+
 
 
+.. _data_requirements_columns_timestamp:
+
 Timestamp
 ^^^^^^^^^
 
@@ -124,7 +126,24 @@ In the sample data this is the ``timestamp`` column.
         - *ISO 8601*, e.g. ``2021-10-13T08:47:23Z``
         - *Unix-epoch* in units of seconds, e.g. ``1513393355``
 
-Currently required for all features of NannyML, though we are looking to drop this requirement in a future release.
+
+.. warning::
+    This column is optional. When a timestamp column is not provided, plots will no longer make use of a time-based x-axis
+    but will use the index of the chunks instead. The following plots illustrate this:
+
+    .. figure:: /_static/drift-guide-salary_range.svg
+
+        Plot using a time-based X-axis
+
+
+    .. figure:: /_static/quick-start-drift-salary_range.svg
+
+        Plot using an index-based X-axis
+
+
+    Some :class:`~nannyml.chunk.Chunker` classes, such as the :class:`~nannyml.chunk.PeriodBasedChunker`,
+    might require the presence of a timestamp.
+
 
 Target
 ^^^^^^
@@ -183,7 +202,7 @@ You can see those requirements in the table below:
 +--------------+-------------------------------------+-------------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+
 | Data         | Performance Estimation              | Realized Performance                | Univariate Feature Drift          | Multivariate Feature Drift        | Target Drift                      | Output Drift                      |
 +==============+=====================================+=====================================+===================================+===================================+===================================+===================================+
-| timestamp    | Required (reference and analysis)   | Required (reference and analysis)   | Required (reference and analysis) | Required (reference and analysis) | Required (reference and analysis) | Required (reference and analysis) |
+| timestamp    |                                     |                                     |                                   |                                   |                                   |                                   |
 +--------------+-------------------------------------+-------------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+
 | features     |                                     |                                     | Required (reference and analysis) | Required (reference and analysis) |                                   |                                   |
 +--------------+-------------------------------------+-------------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+

diff --git a/...drift/model_outputs/drift_detection_for_binary_classification_model_outputs.rst b/...drift/model_outputs/drift_detection_for_binary_classification_model_outputs.rst
@@ -13,6 +13,12 @@ If the model's population changes, then its actions will be different.
 The difference in actions is very important to know as soon as possible because
 they directly affect the business results from operating a machine learning model.
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 Just The Code
 ------------------------------------
 

diff --git a/...t/model_outputs/drift_detection_for_multiclass_classification_model_outputs.rst b/...t/model_outputs/drift_detection_for_multiclass_classification_model_outputs.rst
@@ -13,6 +13,12 @@ If the model's population changes, then our populations' actions will be differe
 The difference in actions is very important to know as soon as possible because
 they directly affect the business results from operating a machine learning model.
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 
 Just The Code
 ------------------------------------

diff --git a/...cting_data_drift/model_outputs/drift_detection_for_regression_model_outputs.rst b/...cting_data_drift/model_outputs/drift_detection_for_regression_model_outputs.rst
@@ -13,6 +13,12 @@ If the model's population changes, then the outcome will be different.
 The difference in actions is very important to know as soon as possible because
 they directly affect the business results from operating a machine learning model.
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 
 Just The Code
 -------------

diff --git a/...drift/model_targets/drift_detection_for_binary_classification_model_targets.rst b/...drift/model_targets/drift_detection_for_binary_classification_model_targets.rst
@@ -23,6 +23,12 @@ of the available target values for each chunk, for both binary and multiclass cl
 .. note::
     The Target Drift detection process can handle missing target values across all :term:`data periods<Data Period>`.
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 
 Just The Code
 ------------------------------------

diff --git a/...t/model_targets/drift_detection_for_multiclass_classification_model_targets.rst b/...t/model_targets/drift_detection_for_multiclass_classification_model_targets.rst
@@ -23,6 +23,12 @@ of the available target values for each chunk, for both binary and multiclass cl
 .. note::
     The Target Drift detection process can handle missing target values across all :term:`data periods<Data Period>`.
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 
 Just The Code
 ------------------------------------

diff --git a/...cting_data_drift/model_targets/drift_detection_for_regression_model_targets.rst b/...cting_data_drift/model_targets/drift_detection_for_regression_model_targets.rst
@@ -21,6 +21,12 @@ but also show the target distribution results per chunk with joyploys.
 .. note::
     The Target Drift detection process can handle missing target values across all :term:`data periods<Data Period>`.
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 
 Just The Code
 -------------

diff --git a/docs/tutorials/performance_calculation/binary_performance_calculation.rst b/docs/tutorials/performance_calculation/binary_performance_calculation.rst
@@ -4,6 +4,12 @@
 Monitoring Realized Performance for Binary Classification
 ================================================================
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 Just The Code
 ==============
 

diff --git a/docs/tutorials/performance_calculation/multiclass_performance_calculation.rst b/docs/tutorials/performance_calculation/multiclass_performance_calculation.rst
@@ -4,6 +4,12 @@
 Monitoring Realized Performance for Multiclass Classification
 ================================================================
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 
 Just The Code
 ==============

diff --git a/docs/tutorials/performance_calculation/regression_performance_calculation.rst b/docs/tutorials/performance_calculation/regression_performance_calculation.rst
@@ -4,6 +4,12 @@
 Monitoring Realized Performance for Regression
 ==============================================
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 Just The Code
 =============
 

diff --git a/docs/tutorials/performance_estimation/binary_performance_estimation.rst b/docs/tutorials/performance_estimation/binary_performance_estimation.rst
@@ -8,6 +8,12 @@ This tutorial explains how to use NannyML to estimate the performance of binary
 models in the absence of target data. To find out how CBPE estimates performance, read the :ref:`explanation of Confidence-based
 Performance Estimation<performance-estimation-deep-dive>`.
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 
 .. _performance-estimation-binary-just-the-code:
 

diff --git a/docs/tutorials/performance_estimation/multiclass_performance_estimation.rst b/docs/tutorials/performance_estimation/multiclass_performance_estimation.rst
@@ -8,6 +8,12 @@ This tutorial explains how to use NannyML to estimate the performance of multicl
 models in the absence of target data. To find out how CBPE estimates performance, read the :ref:`explanation of Confidence-based
 Performance Estimation<performance-estimation-deep-dive>`.
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 Just The Code
 -------------
 

diff --git a/docs/tutorials/performance_estimation/regression_performance_estimation.rst b/docs/tutorials/performance_estimation/regression_performance_estimation.rst
@@ -8,6 +8,12 @@ This tutorial explains how to use NannyML to estimate the performance of regress
 models in the absence of target data. To find out how DLE estimates performance,
 read the :ref:`explanation of how Direct Loss Estimation works<how-it-works-dle>`.
 
+.. note::
+    The following example uses :term:`timestamps<Timestamp>`.
+    These are optional but have an impact on the way data is chunked and results are plotted.
+    You can read more about them in the :ref:`data requirements<data_requirements_columns_timestamp>`.
+
+
 .. _performance-estimation-regression-just-the-code:
 
 Just The Code

diff --git a/nannyml/base.py b/nannyml/base.py
@@ -66,6 +66,7 @@ def __init__(
         chunk_number: int = None,
         chunk_period: str = None,
         chunker: Chunker = None,
+        timestamp_column_name: Optional[str] = None,
     ):
         """Creates a new instance of an abstract DriftCalculator.
 
@@ -83,7 +84,11 @@ def __init__(
         chunker : Chunker
             The `Chunker` used to split the data sets into a lists of chunks.
         """
-        self.chunker = ChunkerFactory.get_chunker(chunk_size, chunk_number, chunk_period, chunker)
+        self.chunker = ChunkerFactory.get_chunker(
+            chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name
+        )
+
+        self.timestamp_column_name = timestamp_column_name
 
     @property
     def _logger(self) -> logging.Logger:
@@ -167,6 +172,7 @@ def __init__(
         chunk_number: int = None,
         chunk_period: str = None,
         chunker: Chunker = None,
+        timestamp_column_name: str = None,
     ):
         """Creates a new instance of an abstract DriftCalculator.
 
@@ -184,7 +190,10 @@ def __init__(
         chunker : Chunker
             The `Chunker` used to split the data sets into a lists of chunks.
         """
-        self.chunker = ChunkerFactory.get_chunker(chunk_size, chunk_number, chunk_period, chunker)
+        self.chunker = ChunkerFactory.get_chunker(
+            chunk_size, chunk_number, chunk_period, chunker, timestamp_column_name
+        )
+        self.timestamp_column_name = timestamp_column_name
 
     @property
     def _logger(self) -> logging.Logger: