From 21323eb65877436a303ca0befe10304df305953f Mon Sep 17 00:00:00 2001
From: umar <46414488+mail4umar@users.noreply.github.com>
Date: Tue, 24 Oct 2023 04:28:55 -0500
Subject: [PATCH 1/4] Bugfix _ Isolation Forest Anomaly Plot
---
verticapy/machine_learning/vertica/cluster.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/verticapy/machine_learning/vertica/cluster.py b/verticapy/machine_learning/vertica/cluster.py
index 9c2a15be6..b679a49a6 100755
--- a/verticapy/machine_learning/vertica/cluster.py
+++ b/verticapy/machine_learning/vertica/cluster.py
@@ -176,7 +176,7 @@ def plot(
**style_kwargs,
}
if self._model_subcategory == "ANOMALY_DETECTION":
- fun = vdf.bubble
+ fun = vdf.scatter
name = "anomaly_score"
kwargs["cmap_col"] = name
else:
From b855761e319b6216adb93e490f22654160cd4df8 Mon Sep 17 00:00:00 2001
From: umar <46414488+mail4umar@users.noreply.github.com>
Date: Tue, 24 Oct 2023 04:47:12 -0500
Subject: [PATCH 2/4] Sphinx Docstring - ML/Vertica/Isolation Forest
---
.../machine_learning/vertica/ensemble.py | 302 ++++++++++++++++++
1 file changed, 302 insertions(+)
diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py
index f43958539..ac9fb6c45 100755
--- a/verticapy/machine_learning/vertica/ensemble.py
+++ b/verticapy/machine_learning/vertica/ensemble.py
@@ -1087,6 +1087,308 @@ class IsolationForest(Clustering, Tree):
Float in the range (0,1] that specifies the
fraction of columns (features), chosen at random,
used when building each tree.
+
+ Examples
+ ---------
+
+ The following examples provide a basic understanding of usage.
+ For more detailed examples, please refer to the
+ :ref:`user_guide.machine_learning` or the
+ `Examples <https://www.vertica.com/python/examples/>`_
+ section on the website.
+
+ .. important::
+
+ Many anomaly detection models inherit from the ``Clustering``
+ base class, and it's recommended to use it directly for
+ access to a wider range of options.
+
+ Load data for machine learning
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ We import ``verticapy``:
+
+ .. ipython:: python
+
+ import verticapy as vp
+
+ .. hint::
+
+ By assigning an alias to ``verticapy``, we mitigate the risk of code
+ collisions with other libraries. This precaution is necessary
+ because verticapy uses commonly known function names like "average"
+ and "median", which can potentially lead to naming conflicts.
+ The use of an alias ensures that the functions from verticapy are
+ used as intended without interfering with functions from other
+ libraries.
+
+ For this example, we will use the winequality dataset.
+
+ .. code-block:: python
+
+ import verticapy.datasets as vpd
+
+ data = vpd.load_winequality()
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html
+
+ .. note::
+
+ VerticaPy offers a wide range of sample datasets that are
+ ideal for training and testing purposes. You can explore
+ the full list of available datasets in the :ref:`api.datasets`,
+ which provides detailed information on each dataset
+ and how to use them effectively. These datasets are invaluable
+ resources for honing your data analysis and machine learning
+ skills within the VerticaPy environment.
+
+ .. ipython:: python
+ :suppress:
+
+ import verticapy.datasets as vpd
+ data = vpd.load_winequality()
+
+ Model Initialization
+ ^^^^^^^^^^^^^^^^^^^^^
+
+ First we import the ``IsolationForest`` model:
+
+ .. code-block:: python
+
+ from verticapy.machine_learning.vertica import IsolationForest
+
+ .. ipython:: python
+ :suppress:
+
+ from verticapy.machine_learning.vertica import IsolationForest
+
+ Then we can create the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model = IsolationForest(
+ n_estimators = 10,
+ max_depth = 3,
+ nbins = 6,
+ )
+
+ .. hint::
+
+ In ``verticapy`` 1.0.x and higher, you do not need to specify the
+ model name, as the name is automatically assigned. If you need to
+ re-use the model, you can fetch the model name from the model's
+ attributes.
+
+ .. important::
+
+ The model name is crucial for the model management system and
+ versioning. It's highly recommended to provide a name if you
+ plan to reuse the model later.
+
+ Model Training
+ ^^^^^^^^^^^^^^^
+
+ We can now fit the model:
+
+ .. ipython:: python
+ :okwarning:
+
+ model.fit(data, X = ["density", "sulphates"])
+
+ .. important::
+
+ To train a model, you can directly use the ``vDataFrame`` or the
+ name of the relation stored in the database. The test set is optional
+ and is only used to compute the test metrics. In ``verticapy``, we
+ don't work using ``X`` matrices and ``y`` vectors. Instead, we work
+ directly with lists of predictors and the response name.
+
+
+ Prediction
+ ^^^^^^^^^^^
+
+ Prediction is straightforward:
+
+ .. ipython:: python
+ :suppress:
+
+ result = model.predict(data["density", "sulphates"])
+ html_file = open("figures/machine_learning_vertica_isolation_for_prediction.html", "w")
+ html_file.write(result._repr_html_())
+ html_file.close()
+
+ .. code-block:: python
+
+ model.predict(data["density", "sulphates"])
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_isolation_for_prediction.html
+
+
+
+ Plots - Anomaly Detection
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ Plots highlighting the outliers can be easily drawn using:
+
+
+ .. code-block:: python
+
+ model.plot()
+
+ .. ipython:: python
+ :suppress:
+
+ vp.set_option("plotting_lib", "plotly")
+ fig = model.plot()
+ fig.write_html("figures/machine_learning_vertica_isolation_for_plot.html")
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_isolation_for_plot.html
+
+
+ Plots - Tree
+ ^^^^^^^^^^^^
+
+ Tree models can be visualized by drawing their tree plots.
+ For more examples, check out :ref:`chart_gallery.tree`.
+
+ .. code-block:: python
+
+ model.plot_tree()
+
+ .. ipython:: python
+ :suppress:
+
+ res = model.plot_tree()
+ res.render(filename='figures/machine_learning_vertica_tree_isolation_for_', format='png')
+
+ .. image:: /../figures/machine_learning_vertica_tree_isolation_for_.png
+
+ .. note::
+
+ The above example may not render properly in the doc because
+ of the huge size of the tree. But it should render nicely
+ in a Jupyter environment.
+
+ In order to plot the graph using `graphviz <https://graphviz.org/>`_
+ separately, you can extract the graphviz DOT file code as follows:
+
+ .. ipython:: python
+
+ model.to_graphviz()
+
+ This string can then be copied into a DOT file which can be
+ parsed by graphviz.
+
+
+ Plots - Contour
+ ^^^^^^^^^^^^^^^^
+
+ In order to understand the parameter space, we can also look
+ at the contour plots:
+
+ .. code-block:: python
+
+ model.contour()
+
+ .. ipython:: python
+ :suppress:
+
+ fig = model.contour()
+ fig.write_html("figures/machine_learning_vertica_isolation_for_contour.html")
+
+ .. raw:: html
+ :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_isolation_for_contour.html
+
+ .. note::
+
+ This plot is only possible for predictors less than 3.
+
+
+ Parameter Modification
+ ^^^^^^^^^^^^^^^^^^^^^^^
+
+ In order to see the parameters:
+
+ .. ipython:: python
+
+ model.get_params()
+
+ And to manually change some of the parameters:
+
+ .. ipython:: python
+
+ model.set_params({'max_depth': 5})
+
+ Model Register
+ ^^^^^^^^^^^^^^
+
+ In order to register the model for tracking and versioning:
+
+ .. code-block:: python
+
+ model.register("model_v1")
+
+ Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html`
+ for more details on model tracking and versioning.
+
+ Model Exporting
+ ^^^^^^^^^^^^^^^^
+
+ **To MemModel**
+
+ .. code-block:: python
+
+ model.to_memmodel()
+
+ .. note::
+
+ ``MemModel`` objects serve as in-memory representations of machine
+ learning models. They can be used for both in-database and in-memory
+ prediction tasks. These objects can be pickled in the same way that
+ you would pickle a ``scikit-learn`` model.
+
+ The preceding methods for exporting the model use ``MemModel``, and it
+ is recommended to use ``MemModel`` directly.
+
+ **To SQL**
+
+ You can get the SQL query equivalent of the Isolation Forest model by:
+
+ .. ipython:: python
+
+ model.to_sql()
+
+ .. note:: This SQL query can be directly used in any database.
+
+ **Deploy SQL**
+
+ To get the SQL query that uses Vertica functions, use the following:
+
+ .. ipython:: python
+
+ model.deploySQL()
+
+ **To Python**
+
+ To obtain the prediction function in Python syntax, use the following code:
+
+ .. ipython:: python
+
+ X = [[0.9, 0.5]]
+ model.to_python()(X)
+
+ .. hint::
+
+ The
+ :py:meth:`verticapy.machine_learning.vertica.ensemble.IsolationForest.to_python`
+ method is used to retrieve the anomaly score.
+ For specific details on how to
+ use this method for different model types, refer to the relevant
+ documentation for each model.
"""
# Properties.
From 8b7be8896b57772a82691d7941c4a80b5a7a15f4 Mon Sep 17 00:00:00 2001
From: umar <46414488+mail4umar@users.noreply.github.com>
Date: Tue, 24 Oct 2023 04:54:14 -0500
Subject: [PATCH 3/4] fixed black
---
verticapy/machine_learning/vertica/ensemble.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py
index ac9fb6c45..aa933b0c2 100755
--- a/verticapy/machine_learning/vertica/ensemble.py
+++ b/verticapy/machine_learning/vertica/ensemble.py
@@ -1087,7 +1087,7 @@ class IsolationForest(Clustering, Tree):
Float in the range (0,1] that specifies the
fraction of columns (features), chosen at random,
used when building each tree.
-
+
Examples
---------
From cf52c19c2856b22b93354a889915e5f7c27f1db9 Mon Sep 17 00:00:00 2001
From: Badr
Date: Tue, 24 Oct 2023 07:43:49 -0400
Subject: [PATCH 4/4] corrections
---
.../machine_learning/vertica/ensemble.py | 42 +++++++++++--------
.../machine_learning/vertica/linear_model.py | 1 +
2 files changed, 25 insertions(+), 18 deletions(-)
diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py
index aa933b0c2..41e3496cd 100755
--- a/verticapy/machine_learning/vertica/ensemble.py
+++ b/verticapy/machine_learning/vertica/ensemble.py
@@ -1097,14 +1097,8 @@ class IsolationForest(Clustering, Tree):
`Examples <https://www.vertica.com/python/examples/>`_
section on the website.
- .. important::
-
- Many anomaly detection models inherit from the ``Clustering``
- base class, and it's recommended to use it directly for
- access to a wider range of options.
-
Load data for machine learning
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
We import ``verticapy``:
@@ -1205,6 +1199,13 @@ class IsolationForest(Clustering, Tree):
don't work using ``X`` matrices and ``y`` vectors. Instead, we work
directly with lists of predictors and the response name.
+ .. hint::
+
+ For clustering and anomaly detection, the use of predictors is
+ optional. In such cases, all available predictors are considered,
+ which can include solely numerical variables or a combination of
+ numerical and categorical variables, depending on the model's
+ capabilities.
Prediction
^^^^^^^^^^^
@@ -1214,26 +1215,23 @@ class IsolationForest(Clustering, Tree):
.. ipython:: python
:suppress:
- result = model.predict(data["density", "sulphates"])
+ result = model.predict(data, ["density", "sulphates"])
html_file = open("figures/machine_learning_vertica_isolation_for_prediction.html", "w")
html_file.write(result._repr_html_())
html_file.close()
.. code-block:: python
- model.predict(data["density", "sulphates"])
+ model.predict(data, ["density", "sulphates"])
.. raw:: html
:file: SPHINX_DIRECTORY/figures/machine_learning_vertica_isolation_for_prediction.html
-
-
Plots - Anomaly Detection
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
Plots highlighting the outliers can be easily drawn using:
-
.. code-block:: python
model.plot()
@@ -1248,9 +1246,15 @@ class IsolationForest(Clustering, Tree):
.. raw:: html
:file: SPHINX_DIRECTORY/figures/machine_learning_vertica_isolation_for_plot.html
+ .. note::
+
+ Most anomaly detection methods produce a score. In scenarios involving
+ 2 or 3 predictors, using a scatter plot to visualize the model's results
+ is a straightforward approach. In such plots, the color of each point
+ corresponds to the anomaly score.
Plots - Tree
- ^^^^^^^^^^^^
+ ^^^^^^^^^^^^^
Tree models can be visualized by drawing their tree plots.
For more examples, check out :ref:`chart_gallery.tree`.
@@ -1283,7 +1287,6 @@ class IsolationForest(Clustering, Tree):
This string can then be copied into a DOT file which can be
parsed by graphviz.
-
Plots - Contour
^^^^^^^^^^^^^^^^
@@ -1305,9 +1308,12 @@ class IsolationForest(Clustering, Tree):
.. note::
- This plot is only possible for predictors less than 3.
-
-
+ Machine learning models with two predictors can usually benefit
+ from their own contour plot. This visual representation aids in
+ exploring predictions and gaining a deeper understanding of how
+ these models perform in different scenarios. Please refer to
+ :ref:`chart_gallery.contour_plot` for more examples.
+
Parameter Modification
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py
index 554a7a98f..5571830cc 100755
--- a/verticapy/machine_learning/vertica/linear_model.py
+++ b/verticapy/machine_learning/vertica/linear_model.py
@@ -324,6 +324,7 @@ class ElasticNet(Regressor, LinearModel):
in training the model. Note that setting
fit_intercept to false does not work well with
the BFGS optimizer.
+
Examples
---------