Sphinx Docstring Update - Bisecting KMeans #766

Merged 4 commits on Oct 25, 2023
File changed: verticapy/machine_learning/vertica/cluster.py (363 additions, 10 deletions)

class BisectingKMeans(KMeans, Tree):
split_method: str, optional
The method used to choose a cluster to
bisect/split.

- size:
Choose the largest cluster to bisect.
- sum_squares:
Choose the cluster with the largest
withInSS to bisect.

min_divisible_cluster_size: int, optional
The minimum number of points of a divisible
cluster. Must be greater than or equal to 2.
init: str / list, optional
The method used to find the initial KMeans
cluster centers.

- kmeanspp:
Uses the KMeans++ method to initialize
the centers.
- pseudo:
Uses "pseudo center" approach used by
Spark, bisects given center without iterating
over points.

You can also provide a list with the initial
cluster centers.
max_iter: int, optional
converged after no center has moved more
than a distance of 'tol' from the previous
iteration.
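
The bisection target at each step depends on ``split_method``. As a rough
illustration of the selection rule only (not VerticaPy's internal code, and
with made-up cluster summaries), it can be sketched in plain Python:

```python
# Hypothetical cluster summaries: (name, point count, within-cluster SS).
# Illustrative values only; Vertica computes these internally.
clusters = [
    ("c0", 120, 35.2),
    ("c1", 340, 12.7),
    ("c2", 80, 48.9),
]

def pick_cluster_to_bisect(clusters, split_method="sum_squares"):
    # 'size' picks the largest cluster; 'sum_squares' picks the cluster
    # with the largest within-cluster sum of squares (withInSS).
    if split_method == "size":
        return max(clusters, key=lambda c: c[1])[0]
    return max(clusters, key=lambda c: c[2])[0]

pick_cluster_to_bisect(clusters, "size")         # largest cluster wins
pick_cluster_to_bisect(clusters, "sum_squares")  # largest withInSS wins
```

With these sample summaries, ``size`` and ``sum_squares`` select different
clusters, which is why the parameter can noticeably change the resulting tree.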

Examples
---------

The following examples provide a basic understanding of usage.
For more detailed examples, please refer to the
:ref:`user_guide.machine_learning` or the
`Examples <https://www.vertica.com/python/examples/>`_
section on the website.

Load data for machine learning
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

We import ``verticapy``:

.. ipython:: python

import verticapy as vp

.. hint::

By assigning an alias to ``verticapy``, we mitigate the risk of code
collisions with other libraries. This precaution is necessary
because verticapy uses commonly known function names like "average"
and "median", which can potentially lead to naming conflicts.
The use of an alias ensures that the functions from verticapy are
used as intended without interfering with functions from other
libraries.

For this example, we will use the winequality dataset.

.. code-block:: python

import verticapy.datasets as vpd

data = vpd.load_winequality()

.. raw:: html
:file: SPHINX_DIRECTORY/figures/datasets_loaders_load_winequality.html

.. note::

VerticaPy offers a wide range of sample datasets that are
ideal for training and testing purposes. You can explore
the full list of available datasets in the :ref:`api.datasets`,
which provides detailed information on each dataset
and how to use them effectively. These datasets are invaluable
resources for honing your data analysis and machine learning
skills within the VerticaPy environment.

.. ipython:: python
:suppress:

import verticapy.datasets as vpd
data = vpd.load_winequality()

Model Initialization
^^^^^^^^^^^^^^^^^^^^^

First we import the ``BisectingKMeans`` model:

.. code-block:: python

from verticapy.machine_learning.vertica import BisectingKMeans

.. ipython:: python
:suppress:

from verticapy.machine_learning.vertica import BisectingKMeans

Then we can create the model:

.. ipython:: python
:okwarning:

model = BisectingKMeans(
n_cluster = 8,
bisection_iterations = 1,
split_method = 'sum_squares',
min_divisible_cluster_size = 2,
distance_method = "euclidean",
init = "kmeanspp",
max_iter = 300,
tol = 1e-4
)

.. hint::

In ``verticapy`` 1.0.x and higher, you do not need to specify the
model name, as the name is automatically assigned. If you need to
re-use the model, you can fetch the model name from the model's
attributes.

.. important::

The model name is crucial for the model management system and
versioning. It's highly recommended to provide a name if you
plan to reuse the model later.

Model Training
^^^^^^^^^^^^^^^

We can now fit the model:

.. ipython:: python
:okwarning:

model.fit(data, X = ["density", "sulphates"])

.. important::

To train a model, you can directly use the ``vDataFrame`` or the
name of the relation stored in the database. The test set is optional
and is only used to compute the test metrics. In ``verticapy``, we
don't work using ``X`` matrices and ``y`` vectors. Instead, we work
directly with lists of predictors and the response name.

.. hint::

For clustering and anomaly detection, the use of predictors is
optional. In such cases, all available predictors are considered,
which can include solely numerical variables or a combination of
numerical and categorical variables, depending on the model's
capabilities.

Metrics
^^^^^^^^

You can also find the cluster positions by:

.. ipython:: python

model.clusters_

In order to get the size of each cluster, you can use:

.. ipython:: python

model.cluster_size_

To evaluate the model, various attributes are computed, such as
the between sum of squares, the total within clusters sum of
squares, and the total sum of squares.

.. ipython:: python

model.between_cluster_ss_
model.total_within_cluster_ss_
model.total_ss_
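
These attributes satisfy the identity: total SS = between SS + total within
SS. A quick quality ratio can therefore be computed by hand; the sketch below
uses made-up values rather than actual model output:

```python
# Illustrative values only; in practice read the fitted model's
# between_cluster_ss_, total_within_cluster_ss_, and total_ss_ attributes.
between_ss = 7.5
within_ss = 2.5
total_ss = between_ss + within_ss

# Fraction of total variance explained by the clustering (higher is better).
explained = between_ss / total_ss
print(explained)  # 0.75
```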

You also have access to the sum of squares of each cluster.

.. ipython:: python

model.cluster_i_ss_

Some other useful attributes can be used to evaluate the model,
such as the elbow score (the higher, the better).

.. ipython:: python

model.elbow_score_

Prediction
^^^^^^^^^^^

Predicting or ranking the dataset is straightforward:

.. ipython:: python
:suppress:

result = model.predict(data, ["density", "sulphates"])
html_file = open("figures/machine_learning_vertica_bisect_km_prediction.html", "w")
html_file.write(result._repr_html_())
html_file.close()

.. code-block:: python

model.predict(data, ["density", "sulphates"])

.. raw:: html
:file: SPHINX_DIRECTORY/figures/machine_learning_vertica_bisect_km_prediction.html

As shown above, a new column has been created, containing
the predicted cluster for each row.

Plots - Cluster Plot
^^^^^^^^^^^^^^^^^^^^^

Plots highlighting the different clusters can be easily drawn using:

.. code-block:: python

model.plot()

.. ipython:: python
:suppress:

vp.set_option("plotting_lib", "plotly")
fig = model.plot(width = 600)
fig.write_html("figures/machine_learning_vertica_bisect_km_plot.html")

.. raw:: html
:file: SPHINX_DIRECTORY/figures/machine_learning_vertica_bisect_km_plot.html

Plots - Tree
^^^^^^^^^^^^^

Tree models can be visualized by drawing their tree plots.
For more examples, check out :ref:`chart_gallery.tree`.

.. code-block:: python

model.plot_tree()

.. ipython:: python
:suppress:

res = model.plot_tree()
res.render(filename='figures/machine_learning_vertica_tree_bisect_km_', format='png')

.. image:: /../figures/machine_learning_vertica_tree_bisect_km_.png

.. note::

The above example may not render properly in the documentation
because of the size of the tree, but it should render nicely
in a Jupyter environment.

To plot the graph separately using `graphviz <https://graphviz.org/>`_,
you can extract the graphviz DOT source as follows:

.. ipython:: python

model.to_graphviz()

This string can then be saved to a DOT file, which graphviz
can parse and render.
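
For example, the DOT source can be written to a file and rendered with the
``dot`` command-line tool. The string below is a hypothetical stand-in for
real ``model.to_graphviz()`` output:

```python
# Hypothetical DOT source; in practice use: dot_source = model.to_graphviz()
dot_source = "digraph Tree {\n    0 -> 1;\n    0 -> 2;\n}"

# Write the source to a .dot file that graphviz can consume.
with open("bisecting_kmeans_tree.dot", "w") as f:
    f.write(dot_source)

# Then render it from a shell, e.g.:
#   dot -Tpng bisecting_kmeans_tree.dot -o tree.png
```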

Plots - Contour
^^^^^^^^^^^^^^^^

In order to understand the parameter space, we can also look
at the contour plots:

.. code-block:: python

model.contour()

.. ipython:: python
:suppress:

fig = model.contour(width = 600)
fig.write_html("figures/machine_learning_vertica_bisect_km_contour.html")

.. raw:: html
:file: SPHINX_DIRECTORY/figures/machine_learning_vertica_bisect_km_contour.html

.. note::

Machine learning models with two predictors can usually benefit
from their own contour plot. This visual representation aids in
exploring predictions and gaining a deeper understanding of how
these models perform in different scenarios. Please refer to
:ref:`chart_gallery.contour_plot` for more examples.

Parameter Modification
^^^^^^^^^^^^^^^^^^^^^^^

In order to see the parameters:

.. ipython:: python

model.get_params()

And to manually change some of the parameters:

.. ipython:: python

model.set_params({'n_cluster': 5})

Model Register
^^^^^^^^^^^^^^

In order to register the model for tracking and versioning:

.. code-block:: python

model.register("model_v1")

Please refer to :ref:`notebooks/ml/model_tracking_versioning/index.html`
for more details on model tracking and versioning.

Model Exporting
^^^^^^^^^^^^^^^^

**To Memmodel**

.. code-block:: python

model.to_memmodel()

.. note::

``MemModel`` objects serve as in-memory representations of machine
learning models. They can be used for both in-database and in-memory
prediction tasks. These objects can be pickled in the same way that
you would pickle a ``scikit-learn`` model.

The preceding methods for exporting the model use ``MemModel``, and it
is recommended to use ``MemModel`` directly.

**To SQL**

You can get the SQL query equivalent of the model by:

.. ipython:: python

model.to_sql()

.. note:: This SQL query can be directly used in any database.

**Deploy SQL**

To get the SQL query that uses Vertica functions, use the following:

.. ipython:: python

model.deploySQL()

**To Python**

To obtain the prediction function in Python syntax, use the following code:

.. ipython:: python

X = [[0.9, 0.5]]
model.to_python()(X)
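
Conceptually, the exported function assigns each observation to a cluster; a
minimal stand-alone sketch of a nearest-centroid assignment (with hypothetical
centers, not the fitted model's, and ignoring the bisecting tree structure)
looks like this:

```python
import math

# Hypothetical cluster centers in (density, sulphates) space.
centers = [[0.99, 0.45], [0.95, 0.65]]

def predict(rows):
    # Assign each row to the index of its nearest center (Euclidean distance).
    return [
        min(range(len(centers)),
            key=lambda i: math.dist(row, centers[i]))
        for row in rows
    ]

predict([[0.9, 0.5]])  # nearest-center index for one observation
```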

.. hint::

The
:py:mod:`verticapy.machine_learning.vertica.tree.BisectingKMeans.to_python`
method is used to retrieve the predicted cluster.
For specific details on how to
use this method for different model types, refer to the relevant
documentation for each model.
"""

# Properties.