From 21323eb65877436a303ca0befe10304df305953f Mon Sep 17 00:00:00 2001 From: umar <46414488+mail4umar@users.noreply.github.com> Date: Tue, 24 Oct 2023 04:28:55 -0500 Subject: [PATCH 1/4] Bugfix _ Isolation Forest Anomaly Plot --- verticapy/machine_learning/vertica/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/verticapy/machine_learning/vertica/cluster.py b/verticapy/machine_learning/vertica/cluster.py index 9c2a15be6..b679a49a6 100755 --- a/verticapy/machine_learning/vertica/cluster.py +++ b/verticapy/machine_learning/vertica/cluster.py @@ -176,7 +176,7 @@ def plot( **style_kwargs, } if self._model_subcategory == "ANOMALY_DETECTION": - fun = vdf.bubble + fun = vdf.scatter name = "anomaly_score" kwargs["cmap_col"] = name else: From b855761e319b6216adb93e490f22654160cd4df8 Mon Sep 17 00:00:00 2001 From: umar <46414488+mail4umar@users.noreply.github.com> Date: Tue, 24 Oct 2023 04:47:12 -0500 Subject: [PATCH 2/4] Sphinx Docstring - ML/Vertica/Isolation Forest --- .../machine_learning/vertica/ensemble.py | 302 ++++++++++++++++++ 1 file changed, 302 insertions(+) diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py index f43958539..ac9fb6c45 100755 --- a/verticapy/machine_learning/vertica/ensemble.py +++ b/verticapy/machine_learning/vertica/ensemble.py @@ -1087,6 +1087,308 @@ class IsolationForest(Clustering, Tree): Float in the range (0,1] that specifies the fraction of columns (features), chosen at random, used when building each tree. + + Examples + --------- + + The following examples provide a basic understanding of usage. + For more detailed examples, please refer to the + :ref:`user_guide.machine_learning` or the + `Examples `_ + section on the website. + + .. important:: + + Many anomaly detection models inherit from the ``Clustering`` + base class, and it's recommended to use it directly for + access to a wider range of options. 
+ + Load data for machine learning + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + We import ``verticapy``: + + .. ipython:: python + + import verticapy as vp + + .. hint:: + + By assigning an alias to ``verticapy``, we mitigate the risk of code + collisions with other libraries. This precaution is necessary + because verticapy uses commonly known function names like "average" + and "median", which can potentially lead to naming conflicts. + The use of an alias ensures that the functions from verticapy are + used as intended without interfering with functions from other + libraries. + + For this example, we will use the winequality dataset. + + .. code-block:: python + + import verticapy.datasets as vpd + + data = vpd.load_winequality() + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html + + .. note:: + + VerticaPy offers a wide range of sample datasets that are + ideal for training and testing purposes. You can explore + the full list of available datasets in the :ref:`api.datasets`, + which provides detailed information on each dataset + and how to use them effectively. These datasets are invaluable + resources for honing your data analysis and machine learning + skills within the VerticaPy environment. + + .. ipython:: python + :suppress: + + import verticapy.datasets as vpd + data = vpd.load_winequality() + + Model Initialization + ^^^^^^^^^^^^^^^^^^^^^ + + First we import the ``IsolationForest`` model: + + .. code-block:: + + from verticapy.machine_learning.vertica import IsolationForest + + .. ipython:: python + :suppress: + + from verticapy.machine_learning.vertica import IsolationForest + + Then we can create the model: + + .. ipython:: python + :okwarning: + + model = IsolationForest( + n_estimators = 10, + max_depth = 3, + nbins = 6, + ) + + .. hint:: + + In ``verticapy`` 1.0.x and higher, you do not need to specify the + model name, as the name is automatically assigned. 
If you need to + re-use the model, you can fetch the model name from the model's + attributes. + + .. important:: + + The model name is crucial for the model management system and + versioning. It's highly recommended to provide a name if you + plan to reuse the model later. + + Model Training + ^^^^^^^^^^^^^^^ + + We can now fit the model: + + .. ipython:: python + :okwarning: + + model.fit(data, X = ["density", "sulphates"]) + + .. important:: + + To train a model, you can directly use the ``vDataFrame`` or the + name of the relation stored in the database. The test set is optional + and is only used to compute the test metrics. In ``verticapy``, we + don't work using ``X`` matrices and ``y`` vectors. Instead, we work + directly with lists of predictors and the response name. + + + Prediction + ^^^^^^^^^^^ + + Prediction is straight-forward: + + .. ipython:: python + :suppress: + + result = model.predict(data["density", "sulphates"]) + html_file = open("figures/machine_learning_vertica_isolation_for_prediction.html", "w") + html_file.write(result._repr_html_()) + html_file.close() + + .. code-block:: python + + model.predict(data["density", "sulphates"]) + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_isolation_for_prediction.html + + + + Plots - Anomaly Detection + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + Plots highlighting the outliers can be easily drawn using: + + + .. code-block:: python + + model.plot() + + .. ipython:: python + :suppress: + + vp.set_option("plotting_lib", "plotly") + fig = model.plot() + fig.write_html("figures/machine_learning_vertica_isolation_for_plot.html") + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_isolation_for_plot.html + + + Plots - Tree + ^^^^^^^^^^^^ + + Tree models can be visualized by drawing their tree plots. + For more examples, check out :ref:`chart_gallery.tree`. + + .. code-block:: python + + model.plot_tree() + + .. 
ipython:: python + :suppress: + + res = model.plot_tree() + res.render(filename='figures/machine_learning_vertica_tree_isolation_for_', format='png') + + .. image:: /../figures/machine_learning_vertica_tree_isolation_for_.png + + .. note:: + + The above example may not render properly in the doc because + of the huge size of the tree. But it should render nicely + in a Jupyter environment. + + In order to plot the graph using `graphviz <https://graphviz.org/>`_ + separately, you can extract the graphviz DOT file code as follows: + + .. ipython:: python + + model.to_graphviz() + + This string can then be copied into a DOT file which can be + parsed by graphviz. + + + Plots - Contour + ^^^^^^^^^^^^^^^^ + + In order to understand the parameter space, we can also look + at the contour plots: + + .. code-block:: python + + model.contour() + + .. ipython:: python + :suppress: + + fig = model.contour() + fig.write_html("figures/machine_learning_vertica_isolation_for_contour.html") + + .. raw:: html + :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_isolation_for_contour.html + + .. note:: + + This plot is only possible for predictors less than 3. + + + Parameter Modification + ^^^^^^^^^^^^^^^^^^^^^^^ + + In order to see the parameters: + + .. ipython:: python + + model.get_params() + + And to manually change some of the parameters: + + .. ipython:: python + + model.set_params({'max_depth': 5}) + + Model Register + ^^^^^^^^^^^^^^ + + In order to register the model for tracking and versioning: + + .. code-block:: python + + model.register("model_v1") + + Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html` + for more details on model tracking and versioning. + + Model Exporting + ^^^^^^^^^^^^^^^^ + + **To Memmodel** + + .. code-block:: python + + model.to_memmodel() + + .. note:: + + ``MemModel`` objects serve as in-memory representations of machine + learning models. They can be used for both in-database and in-memory + prediction tasks.
These objects can be pickled in the same way that + you would pickle a ``scikit-learn`` model. + + The following methods for exporting the model use ``MemModel``, and it + is recommended to use ``MemModel`` directly. + + **To SQL** + + You can get the SQL query equivalent of the ``IsolationForest`` model by: + + .. ipython:: python + + model.to_sql() + + .. note:: This SQL query can be directly used in any database. + + **Deploy SQL** + + To get the SQL query that uses Vertica functions, use the following: + + .. ipython:: python + + model.deploySQL() + + **To Python** + + To obtain the prediction function in Python syntax, use the following code: + + .. ipython:: python + + X = [[0.9, 0.5]] + model.to_python()(X) + + .. hint:: + + The + :py:meth:`verticapy.machine_learning.vertica.ensemble.IsolationForest.to_python` + method is used to retrieve the anomaly score. + For specific details on how to + use this method for different model types, refer to the relevant + documentation for each model. """ # Properties. From 8b7be8896b57772a82691d7941c4a80b5a7a15f4 Mon Sep 17 00:00:00 2001 From: umar <46414488+mail4umar@users.noreply.github.com> Date: Tue, 24 Oct 2023 04:54:14 -0500 Subject: [PATCH 3/4] fixed black --- verticapy/machine_learning/vertica/ensemble.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py index ac9fb6c45..aa933b0c2 100755 --- a/verticapy/machine_learning/vertica/ensemble.py +++ b/verticapy/machine_learning/vertica/ensemble.py @@ -1087,7 +1087,7 @@ class IsolationForest(Clustering, Tree): Float in the range (0,1] that specifies the fraction of columns (features), chosen at random, used when building each tree.
- + Examples --------- From cf52c19c2856b22b93354a889915e5f7c27f1db9 Mon Sep 17 00:00:00 2001 From: Badr Date: Tue, 24 Oct 2023 07:43:49 -0400 Subject: [PATCH 4/4] corrections --- .../machine_learning/vertica/ensemble.py | 42 +++++++++++-------- .../machine_learning/vertica/linear_model.py | 1 + 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/verticapy/machine_learning/vertica/ensemble.py b/verticapy/machine_learning/vertica/ensemble.py index aa933b0c2..41e3496cd 100755 --- a/verticapy/machine_learning/vertica/ensemble.py +++ b/verticapy/machine_learning/vertica/ensemble.py @@ -1097,14 +1097,8 @@ class IsolationForest(Clustering, Tree): `Examples `_ section on the website. - .. important:: - - Many anomaly detection models inherit from the ``Clustering`` - base class, and it's recommended to use it directly for - access to a wider range of options. - Load data for machine learning - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ We import ``verticapy``: @@ -1205,6 +1199,13 @@ class IsolationForest(Clustering, Tree): don't work using ``X`` matrices and ``y`` vectors. Instead, we work directly with lists of predictors and the response name. + .. hint:: + + For clustering and anomaly detection, the use of predictors is + optional. In such cases, all available predictors are considered, + which can include solely numerical variables or a combination of + numerical and categorical variables, depending on the model's + capabilities. Prediction ^^^^^^^^^^^ @@ -1214,26 +1215,23 @@ class IsolationForest(Clustering, Tree): .. ipython:: python :suppress: - result = model.predict(data["density", "sulphates"]) + result = model.predict(data, ["density", "sulphates"]) html_file = open("figures/machine_learning_vertica_isolation_for_prediction.html", "w") html_file.write(result._repr_html_()) html_file.close() .. code-block:: python - model.predict(data["density", "sulphates"]) + model.predict(data, ["density", "sulphates"]) .. 
raw:: html :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_isolation_for_prediction.html - - Plots - Anomaly Detection - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^^^ Plots highlighting the outliers can be easily drawn using: - .. code-block:: python model.plot() @@ -1248,9 +1246,15 @@ class IsolationForest(Clustering, Tree): .. raw:: html :file: SPHINX_DIRECTORY/figures/machine_learning_vertica_isolation_for_plot.html + .. note:: + + Most anomaly detection methods produce a score. In scenarios involving + 2 or 3 predictors, using a scatter plot to visualize the model's results + is a straightforward approach. In such plots, the color of each point + corresponds to the anomaly score. Plots - Tree - ^^^^^^^^^^^^ + ^^^^^^^^^^^^^ Tree models can be visualized by drawing their tree plots. For more examples, check out :ref:`chart_gallery.tree`. @@ -1283,7 +1287,6 @@ class IsolationForest(Clustering, Tree): This string can then be copied into a DOT file which can be parsed by graphviz. - Plots - Contour ^^^^^^^^^^^^^^^^ @@ -1305,9 +1308,12 @@ class IsolationForest(Clustering, Tree): .. note:: - This plot is only possible for predictors less than 3. - - + Machine learning models with two predictors can usually benefit + from their own contour plot. This visual representation aids in + exploring predictions and gaining a deeper understanding of how + these models perform in different scenarios. Please refer to + :ref:`chart_gallery.contour_plot` for more examples. + Parameter Modification ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/verticapy/machine_learning/vertica/linear_model.py b/verticapy/machine_learning/vertica/linear_model.py index 554a7a98f..5571830cc 100755 --- a/verticapy/machine_learning/vertica/linear_model.py +++ b/verticapy/machine_learning/vertica/linear_model.py @@ -324,6 +324,7 @@ class ElasticNet(Regressor, LinearModel): in training the model. Note that setting fit_intercept to false does not work well with the BFGS optimizer.
+ Examples ---------