From 512900d0733ba07e48e8680aff9b4a79241de0cc Mon Sep 17 00:00:00 2001 From: Badr Date: Wed, 23 Oct 2024 06:53:56 -0400 Subject: [PATCH] first correction --- .../user_guide_data_exploration_charts.rst | 32 +- ...ata_exploration_descriptive_statistics.rst | 20 +- docs/source/user_guide_data_ingestion.rst | 17 +- ...user_guide_introduction_best_practices.rst | 297 +++++++----------- .../user_guide_introduction_installation.rst | 39 +-- ..._guide_machine_learning_model_tracking.rst | 195 +++--------- 6 files changed, 217 insertions(+), 383 deletions(-) diff --git a/docs/source/user_guide_data_exploration_charts.rst b/docs/source/user_guide_data_exploration_charts.rst index b33d42a22..c254406b7 100644 --- a/docs/source/user_guide_data_exploration_charts.rst +++ b/docs/source/user_guide_data_exploration_charts.rst @@ -28,13 +28,16 @@ Let's start with pies and histograms. Drawing the pie or histogram of a categori .. code-block:: + # Setting the plotting lib vp.set_option("plotting_lib", "highcharts") + titanic = load_titanic() titanic["pclass"].bar() .. ipython:: python :suppress: + # Setting the plotting lib vp.set_option("plotting_lib", "highcharts") titanic = load_titanic() fig = titanic["pclass"].bar() @@ -141,20 +144,21 @@ You can also change the occurences by another aggregation with the `method` and .. raw:: html :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_age_hist_avs.html - VerticaPy uses the same process for other graphics, like 2-dimensional histograms and bar charts. Let us showcase another plotting library for these plots. - .. code-block:: - + + # Setting the plotting lib vp.set_option("plotting_lib", "plotly") + titanic.bar(["pclass", "survived"]) .. ipython:: python :suppress: + # Setting the plotting lib vp.set_option("plotting_lib", "plotly") fig = titanic.bar(["pclass", "survived"]) fig.write_html("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_bar_pclass_surv.html") @@ -162,7 +166,6 @@ Let us showcase another plotting library for these plots. .. raw:: html :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_bar_pclass_surv.html - .. note:: VerticaPy has three main plotting libraries. Look at :ref:`chart_gallery` section for all the different plots. .. code-block:: @@ -242,7 +245,7 @@ Box plots are useful for understanding statistical dispersion. .. raw:: html :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_boxplot_one.html -Scatter and bubble plots are also useful for identifying patterns in your data. Note, however, that these methods don't use aggregations; VerticaPy downsamples the data before plotting. You can use the 'max_nb_points' to limit the number of points and avoid unnecessary memory usage. +Scatter and bubble plots are also useful for identifying patterns in your data. Note, however, that these methods don't use aggregations; VerticaPy downsamples the data before plotting. You can use the `max_nb_points` to limit the number of points and avoid unnecessary memory usage. .. code-block:: @@ -323,8 +326,10 @@ For more information on scatter look at :py:mod:`verticapy.vDataFrame.scatter`. Hexbin plots can be useful for generating heatmaps. These summarize data in a similar way to scatter plots, but compute aggregations to get the final results. .. 
ipython:: python - + + # Setting the plotting lib vp.set_option("plotting_lib", "matplotlib") + @savefig user_guides_data_exploration_iris_hexbin.png iris.hexbin( ["SepalLengthCm", "SepalWidthCm"], @@ -337,6 +342,7 @@ Hexbin, scatter, and bubble plots also allow you to provide a background image. .. code-block:: python africa = load_africa_education() + # displaying avg students score in Africa africa.hexbin( ["lon", "lat"], @@ -349,6 +355,7 @@ Hexbin, scatter, and bubble plots also allow you to provide a background image. :suppress: africa = load_africa_education() + # displaying avg students score in Africa @savefig user_guides_data_exploration_africa_hexbin.png africa.hexbin( @@ -360,17 +367,6 @@ Hexbin, scatter, and bubble plots also allow you to provide a background image. It is also possible to use SHP datasets to draw maps. -.. code-block:: python - - africa = load_africa_education() - # displaying avg students score in Africa - africa.hexbin( - ["lon", "lat"], - method = "avg", - of = "zralocp", - img = "img/africa.png", - ) - .. ipython:: python # Africa Dataset @@ -412,7 +408,9 @@ Since time-series plots do not aggregate the data, it's important to choose the :suppress: :okwarning: + # Setting the plotting lib vp.set_option("plotting_lib", "plotly") + fig = amazon["number"].plot( ts = "date", by = "state", diff --git a/docs/source/user_guide_data_exploration_descriptive_statistics.rst b/docs/source/user_guide_data_exploration_descriptive_statistics.rst index 5fddc77a5..992b02955 100644 --- a/docs/source/user_guide_data_exploration_descriptive_statistics.rst +++ b/docs/source/user_guide_data_exploration_descriptive_statistics.rst @@ -17,7 +17,7 @@ The :py:func:`~verticapy.vDataFrame.agg` method is the best way to compute multi help(vp.vDataFrame.agg) This is a tremendously useful function for understanding your data. -Let's use the `churn dataset `_ +Let's use the `churn dataset `_ .. code-block:: @@ -122,7 +122,9 @@ You can also use the 'groupby' method to compute customized aggregations. "gender", "Contract", ], - ["AVG(DECODE(Churn, 'Yes', 1, 0)) AS Churn"], + [ + "AVG(DECODE(Churn, 'Yes', 1, 0)) AS Churn", + ], ) .. ipython:: python @@ -133,7 +135,9 @@ You can also use the 'groupby' method to compute customized aggregations. "gender", "Contract", ], - ["AVG(DECODE(Churn, 'Yes', 1, 0)) AS Churn"], + [ + "AVG(DECODE(Churn, 'Yes', 1, 0)) AS Churn", + ], ) html_file = open("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_descriptive_stats_group_by.html", "w") html_file.write(res._repr_html_()) @@ -148,7 +152,10 @@ You can also use the 'groupby' method to compute customized aggregations. import verticapy.sql.functions as fun vdf.groupby( - ["gender", "Contract"], + [ + "gender", + "Contract", + ], [ fun.min(vdf["tenure"])._as("min_tenure"), fun.max(vdf["tenure"])._as("max_tenure"), @@ -161,7 +168,10 @@ You can also use the 'groupby' method to compute customized aggregations. 
import verticapy.sql.functions as fun res = vdf.groupby( - ["gender", "Contract"], + [ + "gender", + "Contract", + ], [ fun.min(vdf["tenure"])._as("min_tenure"), fun.max(vdf["tenure"])._as("max_tenure"), diff --git a/docs/source/user_guide_data_ingestion.rst b/docs/source/user_guide_data_ingestion.rst index 46b1ff98c..38658802a 100644 --- a/docs/source/user_guide_data_ingestion.rst +++ b/docs/source/user_guide_data_ingestion.rst @@ -111,6 +111,7 @@ To ingest the file into Vertica, remove the `genSQL` parameter from the above co :file: /project/data/VerticaPy/docs/figures/user_guide_data_ingestion_iris.html When the file to ingest is not located on your local machine, and is on the server instead, then you must set the `ingest_local` parameter to False. + `ingest_local` is True by default. .. note:: In some cases where the CSV file has a very complex structure, local ingestion might fail. If this occurs, you will have to move the file into the database and then ingest the file from that location. @@ -130,8 +131,7 @@ syntax in the path parameter (in this case for multiple CSV files): `path = "pat Ingest CSV files ---------------- -In addition to :py:func:`~verticapy.read_file`, you can also ingest CSV files with the :py:func:`~verticapy.read_csv` function, -which ingests the file using flex tables. This function provides options not available in :py:func:`~verticapy.read_file`, such as: +In addition to :py:func:`~verticapy.read_file`, you can also ingest CSV files with the :py:func:`~verticapy.read_csv` function, which ingests the file using flex tables. This function provides options not available in :py:func:`~verticapy.read_file`, such as: - `sep`: specify the column separator. - `parse_nrows`: the function creates a file of nrows from the data file to identify @@ -140,9 +140,7 @@ the data types. This file is then dropped and the entire data file is ingested. For a full list of supported options, see :py:func:`~verticapy.read_csv` or use the :py:func:`~verticapy.help` function. -In the following example, we will use :py:func:`~verticapy.read_csv` to ingest a -subset of the Titanic dataset. To begin, load the entire Titanic dataset using the -:py:func:`~verticapy.datasets.load_titanic` function: +In the following example, we will use :py:func:`~verticapy.read_csv` to ingest a subset of the Titanic dataset. To begin, load the entire Titanic dataset using the :py:func:`~verticapy.datasets.load_titanic` function: .. ipython:: python @@ -150,8 +148,7 @@ subset of the Titanic dataset. To begin, load the entire Titanic dataset using t titanic = load_titanic() -To convert a subset of the dataset to a CSV file, select the desired rows in -the dataset and use the :py:func:`~verticapy.to_csv` vDataFrame method: +To convert a subset of the dataset to a CSV file, select the desired rows in the dataset and use the :py:func:`~verticapy.to_csv` ``vDataFrame`` method: .. ipython:: python @@ -163,7 +160,8 @@ Before ingesting the above CSV file, we can check its columns and their data typ .. ipython:: python - vp.pcsv(path = "titanic_subset.csv", + vp.pcsv( + path = "titanic_subset.csv", sep = ",", na_rep = "", ) @@ -212,8 +210,7 @@ For a full list of supported options, see the :py:func:`~verticapy.read_json` or VerticaPy also provides a :py:func:`~verticapy.pjson` function to parse JSON files to identify columns and their respective data types. 
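Much like :py:func:`~verticapy.pcsv` for CSV files, :py:func:`~verticapy.pjson` can be used to preview the columns and data types that VerticaPy infers from a JSON file before you ingest it. The following is only a minimal sketch; the ``iris_subset.json`` file name is an assumption for illustration (for example, a file previously exported with the :py:func:`~verticapy.to_json` method):

.. code-block:: python

    # Hypothetical example: parse a local JSON file to preview the
    # columns and data types that would be inferred before ingestion.
    # "iris_subset.json" is an assumed file name used for illustration.
    vp.pjson(path = "iris_subset.json")

As with :py:func:`~verticapy.pcsv`, checking the inferred structure first can save a failed or mistyped ingestion when you later call :py:func:`~verticapy.read_json`.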
-In the following example, we load the iris dataset using the :py:func:`~verticapy.datasets.load_iris` dataset, -convert the vDataFrame to JSON format with the :py:func:`~verticapy.to_json` method, then ingest the JSON file into Vetica: +In the following example, we load the iris dataset using the :py:func:`~verticapy.datasets.load_iris` function, convert the vDataFrame to JSON format with the :py:func:`~verticapy.to_json` method, then ingest the JSON file into Vertica: .. code-block:: python diff --git a/docs/source/user_guide_introduction_best_practices.rst b/docs/source/user_guide_introduction_best_practices.rst index 35900432b..401ecc02d 100644 --- a/docs/source/user_guide_introduction_best_practices.rst +++ b/docs/source/user_guide_introduction_best_practices.rst @@ -3,41 +3,22 @@ Best practices =============== - In this tutorial, we will explore some best practices and optimizations to help you get the most out of Vertica and VerticaPy. Restrict objects and operations to essential columns ------------------------------------------------------- -As VerticaPy is effectively an abstraction of SQL, any database-level -optimizations you make in your Vertica database carry over to VerticaPy. -In Vertica, optimization is centered on projections, which are collections -of table columns—from one or more tables—stored on disk in a format that -optimizes query execution. When you write queries in terms of the original -tables, the query uses the projections to return query results. For details -about creating and designing projections, see the Projections section in the Vertica documentation. - -Projections are created and managed in the Vertica database, but you can -leverage the power of projections in VerticaPy with features such as the -`vDataFrame`'s usecols parameter, which specifies the columns from the input -relation to include in the `vDataFrame`. As columnar databases perform better -when there are fewer columns in the query, especially when you are working -with large datasets, limiting `vDataFrame` and operations to essential -columns can lead to a significant performance improvement. By default, -most `vDataFrame` methods use all numerical columns in the `vDataFrame`, -but you can restrict the operation to specific columns. - -In the following examples, we'll demonstrate how to create a `vDataFrame` -from specific columns in the input relation, and then run methods on that -`vDataFrame`. First, load the titanic dataset into Vertica using the -:py:func:`~verticapy.datasets.load_titanic` function: +As VerticaPy is effectively an abstraction of SQL, any database-level optimizations you make in your Vertica database carry over to VerticaPy. In Vertica, optimization is centered on projections, which are collections of table columns—from one or more tables—stored on disk in a format that optimizes query execution. When you write queries in terms of the original tables, the query uses the projections to return query results. For details about creating and designing projections, see the Projections section in the Vertica documentation. + +Projections are created and managed in the Vertica database, but you can leverage the power of projections in VerticaPy with features such as the `vDataFrame`'s `usecols` parameter, which specifies the columns from the input relation to include in the `vDataFrame`. 
As columnar databases perform better when there are fewer columns in the query, especially when you are working with large datasets, limiting `vDataFrame` and operations to essential columns can lead to a significant performance improvement. By default, most `vDataFrame` methods use all numerical columns in the `vDataFrame`, but you can restrict the operation to specific columns. +In the following examples, we'll demonstrate how to create a `vDataFrame` from specific columns in the input relation, and then run methods on that `vDataFrame`. First, load the titanic dataset into Vertica using the :py:func:`~verticapy.datasets.load_titanic` function: .. code-block:: python -from verticapy.datasets import load_titanic + from verticapy.datasets import load_titanic -load_titanic() + load_titanic() .. ipython:: python :suppress: @@ -51,28 +32,27 @@ load_titanic() .. raw:: html :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_laod_titanic.html -Supposing we are only interested in the 'survived', 'pclass', 'age', 'parch', and 'sibsp' columns, -we can create a vDataFrame with just those columns by specifying them in the usecols parameter: +Supposing we are only interested in the 'survived', 'pclass', 'age', 'parch', and 'sibsp' columns, we can create a vDataFrame with just those columns by specifying them in the usecols parameter: .. code-block:: python import verticapy as vp - vdf = vp.vDataFrame( + titanic = vp.vDataFrame( "public.titanic", - usecols = ["survived", "pclass", "age", "parch", "sibsp"] + usecols = ["survived", "pclass", "age", "parch", "sibsp"], ) - display(vdf) + titanic.head(100) .. ipython:: python :suppress: import verticapy as vp - vdf = vp.vDataFrame( + titanic = vp.vDataFrame( "public.titanic", - usecols = ["survived", "pclass", "age", "parch", "sibsp"] + usecols = ["survived", "pclass", "age", "parch", "sibsp"], ) - res = vdf + res = titanic.head(100) html_file = open("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_laod_titanic_selective.html", "w") html_file.write(res._repr_html_()) html_file.close() @@ -85,148 +65,140 @@ If we run the :py:func:`~verticapy.vDataFrame.avg` method without specifying col .. note:: To examine the generated SQL for each command, turn on the "sql_on" option using :py:func:`~verticapy.set_option`. .. ipython:: python - + + # Turning on SQL. vp.set_option("sql_on", True) - vdf.avg() + + titanic.avg() To turn off the SQL code generation option: .. ipython:: python - + + # Turning off SQL. vp.set_option("sql_on", False) -To restrict the operation to specific columns in the vDataFrame, provide the column names in the `columns` parameter: +To restrict the operation to specific columns in the ``vDataFrame``, provide the column names in the `columns` parameter: .. code-block:: python - vdf.avg(columns = ["age", "survived"]) + titanic.avg(columns = ["age", "survived"]) .. ipython:: python :suppress: - res = vdf.avg(columns = ["age", "survived"]) - html_file = open("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_vdf_avg.html", "w") + res = titanic.avg(columns = ["age", "survived"]) + html_file = open("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_titanic_avg.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_vdf_avg.html - -As we are working with a small dataset, the perfomance impact of -excluding unncessary columns is not very significant. However, with large -datasets (e.g. 
greater than a TB), the impact is much greater, and choosing -essential columns becomes a key step in improving performance. + :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_titanic_avg.html +As we are working with a small dataset, the performance impact of excluding unnecessary columns is not very significant. However, with large datasets (e.g. greater than a TB), the impact is much greater, and choosing essential columns becomes a key step in improving performance. Instead of specifying essential columns to include, some methods allow you to list the columns to exclude with the `exclude_columns` parameter: - .. ipython:: python - vdf.numcol(exclude_columns = ["parch", "sibsp"]) + titanic.numcol(exclude_columns = ["parch", "sibsp"]) .. note:: - To list all columns in a vDataFrame, including non-numerical columns, use the :py:func:`~verticapy.vDataFrame.get_columns` method. + To list all columns in a ``vDataFrame``, including non-numerical columns, use the :py:func:`~verticapy.vDataFrame.get_columns` method. You can then use this truncated list of columns in another method call; for instance, to compute a correlation matrix: .. code-block:: python - vdf.corr(columns = vdf.numcol(exclude_columns = ["parch", "sibsp"])) + titanic.corr(columns = titanic.numcol(exclude_columns = ["parch", "sibsp"])) .. ipython:: python :suppress: vp.set_option("plotting_lib", "plotly") - fig = vdf.corr(columns = vdf.numcol(exclude_columns = ["parch", "sibsp"])) - fig.write_html("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_vdf_corr.html") + fig = titanic.corr(columns = titanic.numcol(exclude_columns = ["parch", "sibsp"])) + fig.write_html("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_titanic_corr.html") .. raw:: html - :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_vdf_corr.html + :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_titanic_corr.html Save the current relation -------------------------- -The vDataFrame works like a `view`, -a stored query that encapsulates one or more SELECT statements. +The ``vDataFrame`` works like a `view`, a stored query that encapsulates one or more SELECT statements. If the generated relation uses many different functions, the computation time for each method call is greatly increased. -Small transformations don't drastically slow down computation, -but heavy transformations (multiple joins, frequent use of advanced analytical funcions, moving windows, etc.) can result in noticeable slowdown. When performing computationally expensive operations, you can aid performance by saving the vDataFrame structure as a table in the Vertica database. We will demonstrate this process in the following example. +Small transformations don't drastically slow down computation, but heavy transformations (multiple joins, frequent use of advanced analytical functions, moving windows, etc.) can result in noticeable slowdown. When performing computationally expensive operations, you can aid performance by saving the vDataFrame structure as a table in the Vertica database. We will demonstrate this process in the following example. -First, create a vDataFrame, then perform some operations on that `vDataFrame`: +First, create a ``vDataFrame``, then perform some operations on that `vDataFrame`: .. 
code-block:: python - vdf = vp.vDataFrame("public.titanic") - vdf["sex"].label_encode()["boat"].fillna(method = "0ifnull")["name"].str_extract( + titanic = vp.vDataFrame("public.titanic") + titanic["sex"].label_encode()["boat"].fillna(method = "0ifnull")["name"].str_extract( ' ([A-Za-z]+)\.').eval("family_size", expr = "parch + sibsp + 1").drop( columns = ["cabin", "body", "ticket", "home.dest"])["fare"].fill_outliers().fillna() .. ipython:: python :suppress: - vdf = vp.vDataFrame("public.titanic") - vdf["sex"].label_encode()["boat"].fillna(method = "0ifnull")["name"].str_extract(' ([A-Za-z]+)\.').eval("family_size", expr = "parch + sibsp + 1").drop(columns = ["cabin", "body", "ticket", "home.dest"])["fare"].fill_outliers().fillna() - - + titanic = vp.vDataFrame("public.titanic") + titanic["sex"].label_encode()["boat"].fillna(method = "0ifnull")["name"].str_extract(' ([A-Za-z]+)\.').eval("family_size", expr = "parch + sibsp + 1").drop(columns = ["cabin", "body", "ticket", "home.dest"])["fare"].fill_outliers().fillna() .. ipython:: python - print(vdf.current_relation()) - + print(titanic.current_relation()) To understand how Vertica executes the different aggregations in the above relation, let's take a look at the query plan: - - .. note:: python - Query plans can be hard to interpret if you don't - know how to parse them. For more information, see - `query plan information and structure `_. + Query plans can be hard to interpret if you don't know how to parse them. For more information, see `query plan information and structure `_. .. ipython:: python - print(vdf.explain()) + print(titanic.explain()) + +Looking at the plan and its associated relation, it's clear that the transformations we applied to the vDataFrame result in a complicated relation. -Looking at the plan and its associated relation, it's clear that the -transformations we applied to the vDataFrame result in a complicated relation. -Each method call to the vDataFrame must use this relation for computation. +Each method call to the ``vDataFrame`` must use this relation for computation. .. note:: To better understand your queries, check out the :ref:`~verticapy.performance.vertica.qprof.QueryProfiler` function. -To save the relation as a table in the Vertica and replace the current -relation in VerticaPy with the new table relation, use the -`to_db() `_ -method with the `inplace` parameter set to True: +To save the relation as a table in the Vertica and replace the current relation in VerticaPy with the new table relation, use the ``to_db()`` method with the `inplace` parameter set to True: .. code-block:: python - vp.drop("public.titanic_clean", method = "table") # drops any existing table with the same schema and name - vdf.to_db("public.titanic_clean", - relation_type = "table", - inplace = True) + vp.drop( + "public.titanic_clean", + method = "table", + ) # drops any existing table with the same schema and name + titanic.to_db("public.titanic_clean", + relation_type = "table", + inplace = True, + ) .. ipython:: python :suppress: - vp.drop("public.titanic_clean", method = "table") # drops any existing table with the same schema and name - vdf.to_db("public.titanic_clean", - relation_type = "table", - inplace = True) + vp.drop( + "public.titanic_clean", + method = "table", + ) # drops any existing table with the same schema and name + titanic.to_db( + "public.titanic_clean", + relation_type = "table", + inplace = True, + ) .. 
ipython:: python - print(vdf.current_relation()) + print(titanic.current_relation()) - -When dealing with very large datasets, it's best to take caution before saving relations with -complicated transformations. Ideally, you will perform a -thorough data exploration, and only execute heavy transformations when essential. +When dealing with very large datasets, it's best to take caution before saving relations with complicated transformations. Ideally, you will perform a thorough data exploration, and only execute heavy transformations when essential. Use the help function ---------------------- @@ -240,11 +212,7 @@ For a quick and convenient way to view information about an object or function, Close your connections ----------------------- -Each connection to the database increases the concurrency on the system, -so try to close connections when you're done with them. -VerticaPy simplifies the connection process by allowing the user to -create an auto-connection, but the closing of connections must be -done manually with the :ref:`~verticapy.close_connection` function. +Each connection to the database increases the concurrency on the system, so try to close connections when you're done with them. VerticaPy simplifies the connection process by allowing the user to create an auto-connection, but the closing of connections must be done manually with the :ref:`~verticapy.close_connection` function. To demonstrate, create a database connection: @@ -263,76 +231,66 @@ It is especially important to close connections when you are working in an envir Consider a method's time complexity -------------------------------------- -Some techniques are significantly more computationally expensive than others. -For example, a Kendall correlation is very expensive compared to a -Pearson correlation because, unlike Pearson, Kendall correlations -use a cross join, resulting in a time complexity of O(n*n) (where n is the number of rows). +Some techniques are significantly more computationally expensive than others. For example, a Kendall correlation is very expensive compared to a Pearson correlation because, unlike Pearson, Kendall correlations use a cross join, resulting in a time complexity of O(n*n) (where n is the number of rows). + Let's compare the time needed to compute these two correlations on the 'titanic' dataset: .. ipython:: python import time - vdf = vp.vDataFrame("public.titanic") + titanic = vp.vDataFrame("public.titanic") start_time = time.time() - x = vdf.corr(method = "pearson", show = False) + x = titanic.corr(method = "pearson", show = False) print("Pearson, time: {0}".format(time.time() - start_time)) start_time = time.time() - x = vdf.corr(method = "kendall", show = False) + x = titanic.corr(method = "kendall", show = False) print("Kendall, time: {0}".format(time.time() - start_time)) Limit plot elements -------------------- -Graphics are an essential tool to understand your data, -but they can become difficult to parse if they contain -too many elements. VerticaPy provides options that -restrict plots to specified elements. To demonstrate, -let's first draw a multi-histogram with a categorical -column with thousands of categories: +Graphics are an essential tool to understand your data, but they can become difficult to parse if they contain +too many elements. VerticaPy provides options that restrict plots to specified elements. To demonstrate, let's first draw a multi-histogram with a categorical column with thousands of categories: .. 
code-block:: python - vdf.bar(["name", "survived"]) + titanic.bar(["name", "survived"]) .. ipython:: python :suppress: - fig = vdf.bar(["name", "survived"], width = 900) - fig.write_html("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_vdf_bar_plot.html") + fig = titanic.bar(["name", "survived"], width = 900) + fig.write_html("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_titanic_bar_plot.html") .. raw:: html - :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_vdf_bar_plot.html + :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_titanic_bar_plot.html -VerticaPy outputs the bar chart, but the number of categories -makes the graph basically incomprehensible. Instead, whenever -possible, try to create graphics with as few categories as -possible for your use case: +VerticaPy outputs the bar chart, but the number of categories makes the graph basically incomprehensible. Instead, whenever possible, try to create graphics with as few categories as possible for your use case: .. code-block:: python - vdf.hist(["pclass", "survived"]) + titanic.hist(["pclass", "survived"]) .. ipython:: python :suppress: - fig = vdf.hist(["pclass", "survived"]) - fig.write_html("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_vdf_hist_plot.html") + fig = titanic.hist(["pclass", "survived"]) + fig.write_html("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_titanic_hist_plot.html") .. raw:: html - :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_vdf_hist_plot.html - + :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_titanic_hist_plot.html To view the cardinality of your variables, use the :ref:`~verticapy.vDataFrame.nunique` method: .. code-block:: python - vdf.nunique() + titanic.nunique() .. ipython:: python :suppress: - res = vdf.nunique() + res = titanic.nunique() html_file = open("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_nunqiue.html", "w") html_file.write(res._repr_html_()) html_file.close() @@ -343,24 +301,19 @@ To view the cardinality of your variables, use the :ref:`~verticapy.vDataFrame.n Filter unnecessary data ------------------------ -Filtering your data is a crucial step in data preparation. -Proper filtering avoids unnecessary computations and greatly -improves the performance of each method call. While the -performance impact can be minimal for small datasets, -filtering large datasets is key to improving performance. +Filtering your data is a crucial step in data preparation. Proper filtering avoids unnecessary computations and greatly +improves the performance of each method call. While the performance impact can be minimal for small datasets, filtering large datasets is key to improving performance. -For example, if we are only interested in analyzing Titanic -passengers who didn't have a lifeboat, we can filter on -this requirement using the :ref:`~verticapy.vDataFrame.filter` method: +For example, if we are only interested in analyzing Titanic passengers who didn't have a lifeboat, we can filter on this requirement using the :ref:`~verticapy.vDataFrame.filter` method: .. code-block:: python - vdf.filter("boat IS NOT NULL") + titanic.filter("boat IS NOT NULL") .. 
ipython:: python :suppress: - res = vdf.filter("boat IS NOT NULL") + res = titanic.filter("boat IS NOT NULL") html_file = open("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_filter.html", "w") html_file.write(res._repr_html_()) html_file.close() @@ -368,17 +321,16 @@ this requirement using the :ref:`~verticapy.vDataFrame.filter` method: .. raw:: html :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_filter.html - To drop unnecessary columns from your vDataFrame, use the :ref:`~verticapy.vDataFrame.drop` method: .. code-block:: python - vdf.drop(["name", "body"]) + titanic.drop(["name", "body"]) .. ipython:: python :suppress: - res = vdf.drop(["name", "body"]) + res = titanic.drop(["name", "body"]) html_file = open("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_drop_name_body.html", "w") html_file.write(res._repr_html_()) html_file.close() @@ -386,21 +338,17 @@ To drop unnecessary columns from your vDataFrame, use the :ref:`~verticapy.vData .. raw:: html :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_drop_name_body.html -The dropped columns are excluded from the relation's generated SELECT query: +The dropped columns are excluded from the relation's generated ``SELECT`` query: .. ipython:: python - print(vdf.current_relation()) + print(titanic.current_relation()) Maximize your resources ------------------------ -Large datasets often contain hundreds of columns. -These datasets require VerticaPy to compute many -concurrent, resource-intensive aggregations. To limit -the impact of these aggregations, you can control the -number of queries that VerticaPy sends to the system, -which allows for some useful optimizations. +Large datasets often contain hundreds of columns. These datasets require VerticaPy to compute many +concurrent, resource-intensive aggregations. To limit the impact of these aggregations, you can control the number of queries that VerticaPy sends to the system, which allows for some useful optimizations. In the following example, we'll explore a couple of these optimizations. First, generate a dataset: @@ -408,30 +356,40 @@ In the following example, we'll explore a couple of these optimizations. First, from verticapy.datasets import gen_dataset - vp.drop("public.test_dataset", method= "table") # drop an existing table with the same schema and name + vp.drop("public.test_dataset", method = "table") # drop an existing table with the same schema and name features_ranges = {} for i in range(20): features_ranges[f"x{i}"] = {"type": float, "range": [0, 1]} vp.drop("test_dataset", method = "table") - vdf = gen_dataset(features_ranges, nrows = 100000).to_db("test_dataset", - relation_type = "table", - inplace = True) - display(vdf) + vdf = gen_dataset( + features_ranges, + nrows = 100000, + ).to_db( + "test_dataset", + relation_type = "table", + inplace = True, + ) + vdf.head(100) .. 
ipython:: python :suppress: from verticapy.datasets import gen_dataset - vp.drop("public.test_dataset", method= "table") # drop an existing table with the same schema and name + vp.drop("public.test_dataset", method = "table") # drop an existing table with the same schema and name features_ranges = {} for i in range(20): features_ranges[f"x{i}"] = {"type": float, "range": [0, 1]} vp.drop("test_dataset", method = "table") - vdf = gen_dataset(features_ranges, nrows = 100000).to_db("test_dataset", - relation_type = "table", - inplace = True) - res = vdf + vdf = gen_dataset( + features_ranges, + nrows = 100000, + ).to_db( + "test_dataset", + relation_type = "table", + inplace = True, + ) + res = vdf.head(100) html_file = open("SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_gen_dataset.html", "w") html_file.write(res._repr_html_()) html_file.close() @@ -439,9 +397,7 @@ In the following example, we'll explore a couple of these optimizations. First, .. raw:: html :file: SPHINX_DIRECTORY/figures/user_guide_introduction_best_practices_gen_dataset.html - -To monitor how VerticaPy is computing the aggregations, -use the :py:func:`~verticapy.set_option` function to turn on SQL code generation and turn off cache: +To monitor how VerticaPy is computing the aggregations, use the :py:func:`~verticapy.set_option` function to turn on SQL code generation and turn off cache: .. ipython:: python @@ -450,35 +406,24 @@ use the :py:func:`~verticapy.set_option` function to turn on SQL code generation VerticaPy allows you to send multiple queries, either iteratively or concurrently, to the database when computing aggregations. -First, let's send a single query to compute the average for all columns in the vDataFrame: +First, let's send a single query to compute the average for all columns in the ``vDataFrame``: .. ipython:: python display(vdf.avg(ncols_block = 20)) - We see that there was one SELECT query for all columns in the `vDataFrame`. -You can reduce the impact on the system by using the `ncols_block` -parameter to split the computation into multiple iterative queries, -where the value of the parameter is the number of columns included in each query. +You can reduce the impact on the system by using the `ncols_block` parameter to split the computation into multiple iterative queries, where the value of the parameter is the number of columns included in each query. -For example, setting `ncols_block` to 5 will split the computation, -which consists of 20 total columns, into 4 separate queries, each of -which computes the average for 5 columns: +For example, setting `ncols_block` to 5 will split the computation, which consists of 20 total columns, into 4 separate queries, each of which computes the average for 5 columns: .. ipython:: python display(vdf.avg(ncols_block = 5)) - -In addition to spliting up the computation into separate queries, -you can send multiple queries to the database concurrently. -You specify the number of concurrent queries with the `processes` -parameter, which defines the number of workers involved in the -computation. Each child process creates a DB connection and -then sends its query. In the following example, we use 4 'processes': +In addition to spliting up the computation into separate queries, you can send multiple queries to the database concurrently. +You specify the number of concurrent queries with the `processes` parameter, which defines the number of workers involved in the computation. Each child process creates a DB connection and then sends its query. 
In the following example, we use 4 'processes': .. code-block:: python - vdf.avg(ncols_block = 5, processes = 4) - + vdf.avg(ncols_block = 5, processes = 4) \ No newline at end of file diff --git a/docs/source/user_guide_introduction_installation.rst b/docs/source/user_guide_introduction_installation.rst index b1df1e547..8a07f51b9 100644 --- a/docs/source/user_guide_introduction_installation.rst +++ b/docs/source/user_guide_introduction_installation.rst @@ -3,47 +3,38 @@ Connect to a Vertica database ============================== - -The following tutorial demonstrates a quick and easy way to connect -to a Vertica database using VerticaPy. For a full exploration of -the connection possibilities in VerticaPy, including auto-connections, see Connection. - +The following tutorial demonstrates a quick and easy way to connect to a Vertica database using VerticaPy. For a full exploration of the connection possibilities in VerticaPy, including auto-connections, see Connection. Requirements -------------- Before connecting to a database, you must satisfy the following requirements: - - Have access to a machine with Vertica version 9 or later installed - Install Python 3.9 or later on your machine - Install VerticaPy on your machine - For more information about these installations, see :ref:`gettting_started`. - Connect to a DB ---------------- - -To connect to a database for the first time, -use the :py:func:`verticapy.new_connection` function, replacing -the configuration values with the credentials for your database: +To connect to a database for the first time, use the :py:func:`verticapy.new_connection` function, replacing the configuration values with the credentials for your database: .. code-block:: python import verticapy as vp + vp.new_connection( { "host": "12.345.67.89", "port": "5433", "database": "testdb", "password": "XxX", - "user": "dbadmin" + "user": "dbadmin", }, - name = "Vertica_Connection" + name = "Vertica_Connection", ) .. ipython:: python @@ -51,18 +42,13 @@ the configuration values with the credentials for your database: import verticapy as vp - -The connection is saved to the VerticaPy connection file under -the name specified in the name parameter. To reconnect to -the database using this connection, run the :py:func:`verticapy.connect` -function with the name of the connection as the argument value: - +The connection is saved to the VerticaPy connection file under the name specified in the name parameter. To reconnect to +the database using this connection, run the :py:func:`verticapy.connect` function with the name of the connection as the argument value: .. code-block:: python vp.connect("Vertica_Connection") - To view all available connections, use :py:func:`verticapy.available_connection`. .. code-block:: python @@ -71,23 +57,18 @@ To view all available connections, use :py:func:`verticapy.available_connection` Out: ['Vertica_New_Connection', 'Vertica_Connection'] - -If you need to confirm the parameters for a given function, -you can also use the help function: +If you need to confirm the parameters for a given function, you can also use the help function: .. ipython:: python help(vp.new_connection) - -For an interactive start guide, you can use the help_start() function: +For an interactive start guide, you can use the ``help_start()`` function: .. code-block:: python vp.help_start() - .. 
image:: ../../source/_static/website/user_guides/introduction/user_guide_installation_help_start.PNG :width: 50% - :align: center - + :align: center \ No newline at end of file diff --git a/docs/source/user_guide_machine_learning_model_tracking.rst b/docs/source/user_guide_machine_learning_model_tracking.rst index 690fb74fa..bdb7a3e83 100644 --- a/docs/source/user_guide_machine_learning_model_tracking.rst +++ b/docs/source/user_guide_machine_learning_model_tracking.rst @@ -7,50 +7,16 @@ Model Tracking and Versioning Introduction ------------- +VerticaPy is an open-source Python package on top of Vertica database that supports pandas-like virtual dataframes over database relations. VerticaPy provides scikit-type machine learning functionality on these virtual dataframes. Data is not moved out of the database while performing machine learning or statistical analysis on virtual dataframes. Instead, the computations are done at scale in a distributed fashion inside the Vertica cluster. VerticaPy also takes advantage of multiple Python libraries to create a variety of charts, providing a quick and easy method to illustrate your statistical data. -VerticaPy is an open-source Python package on top of -Vertica database that supports pandas-like virtual -dataframes over database relations. VerticaPy provides -scikit-type machine learning functionality on these -virtual dataframes. Data is not moved out of the -database while performing machine learning or -statistical analysis on virtual dataframes. Instead, -the computations are done at scale in a distributed -fashion inside the Vertica cluster. VerticaPy also -takes advantage of multiple Python libraries to -create a variety of charts, providing a quick and -easy method to illustrate your statistical data. - -In this article, we will introduce two new MLOps -tools recently added to VerticaPy: Model Tracking -and Model Versioning. +In this article, we will introduce two new MLOps tools recently added to VerticaPy: Model Tracking and Model Versioning. Model Tracking --------------- -Data scientists usually train many ML models for a -roject. To help choose the best model, data -scientists need a way to keep track of all -candidate models and compare them using various -metrics. VerticaPy provides a model tracking system -to facilitate this process for a given experiment. -The data scientist first creates an experiment object -and then adds candidate models to that experiment. The -information related to each experiment can be -automatically backed up in the database, so if the -Python environment is closed for any reason, like a -holiday, the data scientist has peace of mind that -the experiment can be easily retrieved. The experiment -object also provides methods to easily compare the -prediction performance of its associated models and -to pick the model with the best performance on a -specific test dataset. - -The following example demonstrates how the model -tracking feature can be used for an experiment that -trains a few binary-classifier models on the Titanic -dataset. First, we must load the titanic data into our -database and store it as a virtual dataframe (vDF): +Data scientists usually train many ML models for a project. To help choose the best model, data scientists need a way to keep track of all candidate models and compare them using various metrics. VerticaPy provides a model tracking system to facilitate this process for a given experiment. 
The data scientist first creates an experiment object and then adds candidate models to that experiment. The information related to each experiment can be automatically backed up in the database, so if the Python environment is closed for any reason, like a holiday, the data scientist has peace of mind that the experiment can be easily retrieved. The experiment object also provides methods to easily compare the prediction performance of its associated models and to pick the model with the best performance on a specific test dataset. + +The following example demonstrates how the model tracking feature can be used for an experiment that trains a few binary-classifier models on the Titanic dataset. First, we must load the titanic data into our database and store it as a virtual dataframe (vDF): .. ipython:: python :okwarning: @@ -61,7 +27,7 @@ database and store it as a virtual dataframe (vDF): predictors = ["age", "fare", "pclass"] response = "survived" -We then define a vExperiment object to track the candidate models. To define the experiment object, specify the following parameters: +We then define a ``vExperiment`` object to track the candidate models. To define the experiment object, specify the following parameters: - experiment_name: The name of the experiment. - test_relation: Relation or vDF to use to test the model. @@ -72,22 +38,8 @@ We then define a vExperiment object to track the candidate models. To define the The following parameters are optional: -- experiment_type: By default ``auto``, meaning VerticaPy tries to detect the -experiment type from the response value. However, it might be cleaner -to explicitly specify the experiment type. -The other valid values for this parameter are ``regressor`` -(for regression models), ``binary`` (for binary classification models), -``multi`` (for multiclass classification models), and ``clustering`` -(for clustering models). -- experiment_table: The name of the table ([schema_name.]table_name) -in the database to archive the experiment. The experiment information -won't be backed up in the database without specifying this -parameter. If the table already exists, its previously stored -experiments are loaded to the object. In this case, the user -must have ``SELECT``, ``INSERT``, and ``DELETE`` privileges -on the table. If the table doesn``t exist and the user has -the necessary privileges for creating such a table, the table is created. - +- experiment_type: By default ``auto``, meaning VerticaPy tries to detect the experiment type from the response value. However, it might be cleaner to explicitly specify the experiment type. The other valid values for this parameter are ``regressor`` (for regression models), ``binary`` (for binary classification models), ``multi`` (for multiclass classification models), and ``clustering`` (for clustering models). +- experiment_table: The name of the table ([schema_name.]table_name) in the database to archive the experiment. The experiment information won't be backed up in the database without specifying this parameter. If the table already exists, its previously stored experiments are loaded to the object. In this case, the user must have ``SELECT``, ``INSERT``, and ``DELETE`` privileges on the table. If the table doesn't exist and the user has the necessary privileges for creating such a table, the table is created. .. ipython:: python :okwarning: @@ -100,46 +52,41 @@ the necessary privileges for creating such a table, the table is created. 
X=predictors, y=response, experiment_type="binary", - experiment_table="my_exp_table_1" + experiment_table="my_exp_table_1", ) - -After creating the experiment object, we can train -different models and add them to the experiment: +After creating the experiment object, we can train different models and add them to the experiment: .. ipython:: python :okwarning: # training a LogisticRegression model from verticapy.machine_learning.vertica import LogisticRegression - model_1 = LogisticRegression("logistic_reg_m", overwrite_model=True) + + model_1 = LogisticRegression("logistic_reg_m", overwrite_model = True) model_1.fit(titanic_vDF, predictors, response) my_experiment_1.add_model(model_1) # training a LinearSVC model - from verticapy.machine_learning.vertica.svm import LinearSVC - model_2 = LinearSVC("svc_m", overwrite_model=True) + from verticapy.machine_learning.vertica import LinearSVC + + model_2 = LinearSVC("svc_m", overwrite_model = True) model_2.fit(titanic_vDF, predictors, response) my_experiment_1.add_model(model_2) # training a DecisionTreeClassifier model - from verticapy.machine_learning.vertica.tree import DecisionTreeClassifier - model_3 = DecisionTreeClassifier("tree_m", overwrite_model=True, max_depth=3) + from verticapy.machine_learning.vertica import DecisionTreeClassifier + + model_3 = DecisionTreeClassifier("tree_m", overwrite_model = True, max_depth = 3) model_3.fit(titanic_vDF, predictors, response) my_experiment_1.add_model(model_3) -So far we have only added three models to the experiment, -but we could add many more in a real scenario. Using -the experiment object, we can easily list the models -in the experiment and pick the one with the best prediction -performance based on a specified metric. - +So far we have only added three models to the experiment, but we could add many more in a real scenario. Using the experiment object, we can easily list the models in the experiment and pick the one with the best prediction performance based on a specified metric. .. code-block:: python my_experiment_1.list_models() - .. ipython:: python :suppress: :okwarning: @@ -152,31 +99,20 @@ performance based on a specified metric. .. raw:: html :file: /project/data/VerticaPy/docs/figures/ug_ml_model_tracking_list_models.html - .. ipython:: python - top_model = my_experiment_1.load_best_model(metric="auc") - - + top_model = my_experiment_1.load_best_model(metric = "auc") -The experiment object facilitates not only model tracking -but also makes cleanup super easy, especially in real-world -scenarios where there is often a large number of leftover -models. The ``drop`` method drops from the database the info -of the experiment and all associated models other than -those specified in the keeping_models list. +The experiment object facilitates not only model tracking but also makes cleanup super easy, especially in real-world +scenarios where there is often a large number of leftover models. The ``drop`` method drops from the database the info of the experiment and all associated models other than those specified in the keeping_models list. .. ipython:: python :okwarning: my_experiment_1.drop(keeping_models=[top_model.model_name]) - - -Experiments are also helpful for performing grid search on -hyper-parameters. The following example shows how they can -be used to study the impact of the max_iter parameter on -the prediction performance of LogisticRegression models. +Experiments are also helpful for performing grid search on hyper-parameters. 
The following example shows how they can +be used to study the impact of the max_iter parameter on the prediction performance of ``LogisticRegression`` models. .. ipython:: python :okwarning: @@ -185,14 +121,14 @@ the prediction performance of LogisticRegression models. my_experiment_2 = mt.vExperiment( experiment_name = "my_exp_2", test_relation = titanic_vDF, - X=predictors, - y=response, - experiment_type="binary" + X = predictors, + y = response, + experiment_type = "binary", ) # training LogisticRegression with different values of max_iter for i in range(1, 5): - model = LogisticRegression(max_iter=i) + model = LogisticRegression(max_iter = i) model.fit(titanic_vDF, predictors, response) my_experiment_2.add_model(model) @@ -202,39 +138,21 @@ the prediction performance of LogisticRegression models. # cleaning all the models associated to the experimen from the database my_experiment_2.drop() - Model Versioning ----------------- -In Vertica version 12.0.4, we added support for In-DB ML Model -Versioning. Now, we have integrated it into VerticaPy -so that users can utilize its capabilities along with the -other tools in VerticaPy. In VerticaPy, model versioning -is a wrapper around an SQL API already built in Vertica. -For more information about the concepts of model versioning -in Vertica, see the Vertica documentation. +In Vertica version 12.0.4, we added support for In-DB ML Model Versioning. Now, we have integrated it into VerticaPy so that users can utilize its capabilities along with the other tools in VerticaPy. In VerticaPy, model versioning is a wrapper around an SQL API already built in Vertica. For more information about the concepts of model versioning in Vertica, see the Vertica documentation. -To showcase model versioning, we will begin by registering -the ``top_model`` picked from the above experiment. +To showcase model versioning, we will begin by registering the ``top_model`` picked from the above experiment. .. ipython:: python :okwarning: top_model.register("top_model_demo") +When the model owner registers the model, its ownership changes to ``DBADMIN``, and the previous owner receives ``USAGE`` privileges. Registered models are referred to by their registered_name and version. Only DBADMIN or a user with the MLSUPERVISOR role can change the status of a registered model. We have provided the ``RegisteredModel`` class in VerticaPy for working with registered models. -When the model owner registers the model, its ownership -changes to ``DBADMIN``, and the previous owner receives -``USAGE`` privileges. Registered models are referred to -by their registered_name and version. Only DBADMIN or a -user with the MLSUPERVISOR role can change the status -of a registered model. We have provided the ``RegisteredModel`` -class in VerticaPy for working with registered models. - -We will now make a ``RegisteredModel`` object for our -recently registered model and change its status to -"production". We can then use the registered model -for scoring. +We will now make a ``RegisteredModel`` object for our recently registered model and change its status to "production". We can then use the registered model for scoring. .. ipython:: python @@ -242,15 +160,12 @@ for scoring. rm = mv.RegisteredModel("top_model_demo") -To see the list of all models registered as ``top_model_demo``, -use the ``list_models`` method. - +To see the list of all models registered as ``top_model_demo``, use the ``list_models`` method. .. code-block:: python rm.list_models() - .. 
ipython:: python :suppress: :okwarning: @@ -263,41 +178,37 @@ use the ``list_models`` method. .. raw:: html :file: /project/data/VerticaPy/docs/figures/ug_ml_model_tracking_list_models_2.html -The model we just registered has a status of "under_review". -The next step is to change the status of the model to -"staging", which is meant for A/B testing the model. -Assuming the model performs well, we will promote it -to the "production" status. Please note that we should -specify the right version of the registered model from the -above table. +The model we just registered has a status of "under_review". The next step is to change the status of the model to "staging", which is meant for A/B testing the model. Assuming the model performs well, we will promote it to the "production" status. Please note that we should specify the right version of the registered model from the above table. .. ipython:: python :okwarning: + # Getting the current version + version = rm.list_models()["registered_version"][0] + # changing the status of the model to staging - rm.change_status(version=rm.list_models()["registered_version"][0], new_status="staging") + rm.change_status(version = version, new_status = "staging") # changing the status of the model to production - rm.change_status(version=rm.list_models()["registered_version"][0], new_status="production") + rm.change_status(version = version, new_status = "production") +There can only be one version of the registered model in "production" at any time. The following predict function applies to the model with "production" status by default. - -There can only be one version of the registered model in -"production" at any time. The following predict function -applies to the model with "production" status by default. -If you want to run the predict function on a model with a -status other than "production", you must also specify the -model version. +If you want to run the predict function on a model with a status other than "production", you must also specify the model version. .. code-block:: python - rm.predict(titanic_vDF, X=predictors, name="predicted_value") + rm.predict( + titanic_vDF, + X = predictors, + name = "predicted_value", + ) .. ipython:: python :suppress: :okwarning: - res = rm.predict(titanic_vDF, X=predictors, name="predicted_value") + res = rm.predict(titanic_vDF, X = predictors, name = "predicted_value") html_file = open("/project/data/VerticaPy/docs/figures/ug_ml_model_tracking_predict.html", "w") html_file.write(res._repr_html_()) html_file.close() @@ -305,9 +216,7 @@ model version. .. raw:: html :file: /project/data/VerticaPy/docs/figures/ug_ml_model_tracking_predict.html -``DBADMIN`` and users who are granted ``SELECT`` privileges on the -``v_monitor.model_status_history`` table are able to monitor -the status history of registered models. +``DBADMIN`` and users who are granted ``SELECT`` privileges on the ``v_monitor.model_status_history`` table are able to monitor the status history of registered models. .. code-block:: python @@ -329,10 +238,4 @@ Conclusion ----------- -The addition of model tracking and model versioning to the -VerticaPy toolkit greatly improves VerticaPy's MLOps capabilities. -We are constantly working to improve VerticaPy and address -the needs of data scientists who wish to harness the power -of Vertica database to empower their data analyses. If you -have any comments or questions, don't hesitate to reach -out in the VerticaPy github community. 
\ No newline at end of file +The addition of model tracking and model versioning to the VerticaPy toolkit greatly improves VerticaPy's MLOps capabilities. We are constantly working to improve VerticaPy and address the needs of data scientists who wish to harness the power of Vertica database to empower their data analyses. If you have any comments or questions, don't hesitate to reach out in the VerticaPy github community. \ No newline at end of file