From db9eedbd60e8d1b6419b806b386c6aa0d654188e Mon Sep 17 00:00:00 2001 From: Umar Farooq Ghumman Date: Fri, 25 Oct 2024 14:41:24 -0500 Subject: [PATCH] removed default directory -> SPHINX_DIRECTORY --- .../source/examples_business_base_station.rst | 50 ++--- docs/source/examples_business_battery.rst | 54 ++--- docs/source/examples_business_booking.rst | 46 ++-- docs/source/examples_business_churn.rst | 42 ++-- .../examples_business_credit_card_fraud.rst | 78 +++---- docs/source/examples_business_football.rst | 196 +++++++++--------- docs/source/examples_business_insurance.rst | 74 +++---- docs/source/examples_business_movies.rst | 112 +++++----- .../source/examples_business_smart_meters.rst | 114 +++++----- docs/source/examples_business_spam.rst | 34 +-- docs/source/examples_business_spotify.rst | 80 +++---- docs/source/examples_learn_commodities.rst | 72 +++---- docs/source/examples_learn_iris.rst | 40 ++-- docs/source/examples_learn_pokemon.rst | 40 ++-- docs/source/examples_learn_titanic.rst | 72 +++---- docs/source/examples_learn_winequality.rst | 28 +-- .../examples_understand_africa_education.rst | 68 +++--- docs/source/examples_understand_amazon.rst | 36 ++-- docs/source/examples_understand_covid19.rst | 60 +++--- .../user_guide_data_exploration_charts.rst | 52 ++--- ...er_guide_data_exploration_correlations.rst | 58 +++--- ...ata_exploration_descriptive_statistics.rst | 30 +-- docs/source/user_guide_data_ingestion.rst | 4 +- ...r_guide_data_preparation_decomposition.rst | 16 +- ...user_guide_data_preparation_duplicates.rst | 12 +- .../user_guide_data_preparation_encoding.rst | 32 +-- ..._data_preparation_features_engineering.rst | 28 +-- .../user_guide_data_preparation_joins.rst | 30 +-- ..._guide_data_preparation_missing_values.rst | 20 +- ...r_guide_data_preparation_normalization.rst | 12 +- .../user_guide_data_preparation_outliers.rst | 38 ++-- ...ser_guide_full_stack_complex_data_vmap.rst | 56 ++--- ...er_guide_full_stack_dblink_integration.rst | 60 +++--- ...ser_guide_full_stack_linear_regression.rst | 80 +++---- docs/source/user_guide_full_stack_to_json.rst | 8 +- docs/source/user_guide_introduction_vdf.rst | 36 ++-- ..._guide_machine_learning_classification.rst | 12 +- ...user_guide_machine_learning_clustering.rst | 8 +- ...er_guide_machine_learning_introduction.rst | 18 +- ..._guide_machine_learning_model_tracking.rst | 16 +- ...user_guide_machine_learning_regression.rst | 10 +- ...ser_guide_machine_learning_time_series.rst | 20 +- docs/source/user_guide_performance_qprof.rst | 50 ++--- 43 files changed, 1001 insertions(+), 1001 deletions(-) diff --git a/docs/source/examples_business_base_station.rst b/docs/source/examples_business_base_station.rst index 00db62dd7..a728b8be0 100644 --- a/docs/source/examples_business_base_station.rst +++ b/docs/source/examples_business_base_station.rst @@ -108,7 +108,7 @@ Let's load the two datasets. from verticapy.datasets import load_world matplotlib.rcParams['animation.embed_limit'] = 2 ** 128 cdr = vp.read_csv( - "/project/data/VerticaPy/docs/source/_static/website/examples/data/base_station/shanghai_cdr.csv", + "SPHINX_DIRECTORY/source/_static/website/examples/data/base_station/shanghai_cdr.csv", schema = "shanghai", table_name = "cdr", sep = ",", @@ -117,7 +117,7 @@ Let's load the two datasets. 
# Unique Row id: It will be used to compute the Polygons intersection cdr["row_id"] = "ROW_NUMBER() OVER(ORDER BY user_id, start_time)" shanghai_districts = vp.read_csv( - "/project/data/VerticaPy/docs/source/_static/website/examples/data/base_station/shanghai_districts.csv", + "SPHINX_DIRECTORY/source/_static/website/examples/data/base_station/shanghai_districts.csv", schema = "shanghai", table_name = "districts", sep = ",", @@ -142,13 +142,13 @@ These datasets contain the following: :suppress: res = cdr.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_base_station_cdr_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_base_station_cdr_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_cdr_head.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_cdr_head.html .. code-block:: python @@ -158,13 +158,13 @@ These datasets contain the following: :suppress: res = shanghai_districts.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_base_station_shanghai_district_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_base_station_shanghai_district_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_shanghai_district_head.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_shanghai_district_head.html Data Exploration ---------------- @@ -313,7 +313,7 @@ Let's examine the network activity of each of our districts. To do this, we need y = "latitude", ) res = intersect_districts_cdr.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_base_station_shanghai_district_activity.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_base_station_shanghai_district_activity.html", "w") html_file.write(res._repr_html_()) html_file.close() @@ -392,7 +392,7 @@ Let's examine the network activity of each of our districts. To do this, we need file.write(fig.__html__()) .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_animated_bar_activity.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_animated_bar_activity.html Like you might expect, Shanghai's downtown is the most active one for the selected period. @@ -435,13 +435,13 @@ We create virtual base stations by grouping existing base stations in 100 cluste :suppress: res = model.predict(bs_xy, name = "cluster") - html_file = open("/project/data/VerticaPy/docs/figures/examples_base_station_model_rediction.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_base_station_model_rediction.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_model_rediction.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_model_rediction.html .. ipython:: python @@ -460,10 +460,10 @@ We create virtual base stations by grouping existing base stations in 100 cluste vp.set_option("plotting_lib","plotly") fig = model.plot_voronoi(plot_crosses = False) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_base_station_voronoi_plotly.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_base_station_voronoi_plotly.html") .. 
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_voronoi_plotly.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_voronoi_plotly.html In this figure, each Voronoi cell represents a base station cluster. @@ -525,13 +525,13 @@ Workload is defined as the number of connections per time interval. To find the :suppress: res = cdr_sample.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_base_station_cdr_sample_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_base_station_cdr_sample_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_cdr_sample_head.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_cdr_sample_head.html @@ -589,7 +589,7 @@ Workload is defined as the number of connections per time interval. To find the file.write(fig.__html__()) .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_animated_scatter_longi.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_animated_scatter_longi.html From the above animation, we can see that we'll typically have unconnected base stations and that the most overloaded base stations are located around the downtown. @@ -622,12 +622,12 @@ We can then calculate the workload for each cluster. :suppress: res = bs_workload_90 - html_file = open("/project/data/VerticaPy/docs/figures/examples_base_station_bs_workload_90.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_base_station_bs_workload_90.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_bs_workload_90.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_bs_workload_90.html .. ipython:: python @@ -663,12 +663,12 @@ We can then calculate the workload for each cluster. :suppress: res = cworkload_bs - html_file = open("/project/data/VerticaPy/docs/figures/examples_base_station_cworkload_bs.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_base_station_cworkload_bs.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_cworkload_bs.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_cworkload_bs.html Data Modeling -------------- @@ -720,10 +720,10 @@ Let's find a suitable number of clusters using elbow curve. from verticapy.machine_learning.model_selection import elbow vp.set_option("plotting_lib", "plotly") fig = elbow(bs_xy, ["longitude", "latitude"]) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_base_station_elbow_longi_lati.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_base_station_elbow_longi_lati.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_elbow_longi_lati.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_elbow_longi_lati.html The :py:func:`~verticapy.machine_learning.model_selection.elbow` curve seems to indicate that 4 would be a good number of clusters, so let's try k = 4 and view the weighted ``k-means`` algorithm's suggested positions for new base stations based on the centers of the clusters. 
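As a quick sketch of this step, a plain ``KMeans`` fit on the station coordinates looks like the code below (``bs_xy`` is the vDataFrame of base-station positions used above; weighting the stations, for instance by repeating each row proportionally to its workload, is one possible trick and an assumption here rather than the exact approach of this example):

.. code-block:: python

    from verticapy.machine_learning.vertica import KMeans

    # Minimal sketch: fit k-means with k = 4 on the station coordinates.
    model = KMeans(n_cluster = 4)
    model.fit(bs_xy, ["longitude", "latitude"])

    # The cluster centers give the candidate positions for new base stations.
    model.clusters_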
@@ -813,13 +813,13 @@ The features used to train our model will be longitude, latitude, total number o :suppress: res = bs_metrics - html_file = open("/project/data/VerticaPy/docs/figures/examples_base_station_bs_metrics.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_base_station_bs_metrics.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_bs_metrics.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_bs_metrics.html .. ipython:: python @@ -862,10 +862,10 @@ The features used to train our model will be longitude, latitude, total number o vp.set_option("plotting_lib","plotly") fig = model.plot() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_base_station_auto_ml_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_base_station_auto_ml_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_base_station_auto_ml_plot.html + :file: SPHINX_DIRECTORY/figures/examples_base_station_auto_ml_plot.html Conclusion ----------- diff --git a/docs/source/examples_business_battery.rst b/docs/source/examples_business_battery.rst index 7d7050285..b2f59ddb8 100644 --- a/docs/source/examples_business_battery.rst +++ b/docs/source/examples_business_battery.rst @@ -82,14 +82,14 @@ Let's examine our data. Here, we use :py:func:`~verticapy.vDataFrame.head` to re .. ipython:: python :suppress: - battery5 = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/battery/data.csv",) + battery5 = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/battery/data.csv",) res = battery5 - html_file = open("/project/data/VerticaPy/docs/figures/examples_battery_table_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_battery_table_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_table_head.html + :file: SPHINX_DIRECTORY/figures/examples_battery_table_head.html Let's perform a few aggregations with :py:func:`~verticapy.vDataFrame.describe` to get a high-level overview of the dataset. @@ -101,12 +101,12 @@ Let's perform a few aggregations with :py:func:`~verticapy.vDataFrame.describe` :suppress: res = battery5.describe() - html_file = open("/project/data/VerticaPy/docs/figures/examples_battery_table_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_battery_table_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_table_describe.html + :file: SPHINX_DIRECTORY/figures/examples_battery_table_describe.html To get a better idea of the changes between each cycle, we look at an aggregation at their start time, duration, and voltage at the beginning and the end of each cycle. @@ -118,12 +118,12 @@ To get a better idea of the changes between each cycle, we look at an aggregatio :suppress: res = battery5["start_time"].describe() - html_file = open("/project/data/VerticaPy/docs/figures/examples_battery__start_time_table_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_battery__start_time_table_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. 
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery__start_time_table_describe.html + :file: SPHINX_DIRECTORY/figures/examples_battery__start_time_table_describe.html To see how the voltage changes during the cycle, we extract the initial and final voltage measurements for each cycle. @@ -189,12 +189,12 @@ To see how the voltage changes during the cycle, we extract the initial and fina ).sort("start_time") cycling_info["cycle_id"] = "ROW_NUMBER() OVER(ORDER BY start_time)" res = cycling_info.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_battery_cycling_info.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_battery_cycling_info.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_cycling_info.html + :file: SPHINX_DIRECTORY/figures/examples_battery_cycling_info.html We can see from the "duration" column that charging seems to take a longer time than discharging. Let's visualize this trend with an animated graph. @@ -213,12 +213,12 @@ Let's visualize this trend with an animated graph. import warnings warnings.filterwarnings("ignore") res = cycling_info.animated_bar(ts = "start_time",columns = ["type", "cycle_duration"]) - html_file = open("/project/data/VerticaPy/docs/figures/examples_battery_animated_bar.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_battery_animated_bar.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_animated_bar.html + :file: SPHINX_DIRECTORY/figures/examples_battery_animated_bar.html The animated graph below shows how the cycles change throughout time. Another way we can verify that charging cycles are longer than discharging cycles is by looking at the average duration of each type of cycle. @@ -236,10 +236,10 @@ The animated graph below shows how the cycles change throughout time. Another wa import verticapy verticapy.set_option("plotting_lib", "plotly") fig = cycling_info.bar(["type"], method = "avg", of = "cycle_duration") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_battery_bar_type.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_battery_bar_type.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_bar_type.html + :file: SPHINX_DIRECTORY/figures/examples_battery_bar_type.html In general, charging cycles are longer than discharging cycles. Let's examine how voltage changes between cycles and their transitions. @@ -275,12 +275,12 @@ Let's examine how voltage changes between cycles and their transitions. ], ) res = cycling_info.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_battery_cycling_info_after_groupby.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_battery_cycling_info_after_groupby.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_cycling_info_after_groupby.html + :file: SPHINX_DIRECTORY/figures/examples_battery_cycling_info_after_groupby.html From this table, it looks like batteries are charged until they are almost full (4.2V) and discharging doesn't begin until they are fully charged. @@ -315,12 +315,12 @@ But first we need to perform some preprocessing. 
window = (-100, -1), name = "smooth_capacity", ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_battery_cycling_info_after_rollign_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_battery_cycling_info_after_rollign_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_cycling_info_after_rollign_2.html + :file: SPHINX_DIRECTORY/figures/examples_battery_cycling_info_after_rollign_2.html Now we can plot the graphs. In VerticaPy we have multiple options to plot the graphs with different syntax of customization. For a complete list of all the graphs and their options check out the :ref:`chart_gallery`. @@ -389,10 +389,10 @@ We can now try to plot it using Plotly. We can conveniently switch between the p # Add legend for the horizontal line plot.add_trace(go.Scatter(x = [None], y = [None], mode = "lines", line = dict(color="green", width=3, dash="dash"), name = "End-of-life criteria")) fig = plot - fig.write_html("/project/data/VerticaPy/docs/figures/examples_battery_discharge_plotly_plote.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_battery_discharge_plotly_plote.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_discharge_plotly_plote.html + :file: SPHINX_DIRECTORY/figures/examples_battery_discharge_plotly_plote.html The sudden increases in battery capacity come from the self-charging property of Li-ion batteries. The smoothed graph makes the downward trend in the battery's capacity very clear. @@ -465,10 +465,10 @@ Since measurements like voltage and temperature tend to differ within the differ sample_cycle = battery5[battery5["Capacity"] == "1.83514614292266"] sample_cycle["Voltage_measured"].plot(ts = "Time") fig = sample_cycle["Temperature_measured"].plot(ts = "Time") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_battery_temp_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_battery_temp_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_temp_plot.html + :file: SPHINX_DIRECTORY/figures/examples_battery_temp_plot.html We'll define new features that describe the minimum and maximum temperature during one cycle; the minimal voltage; and the time needed to reach minimum voltage and maximum temperature. @@ -657,10 +657,10 @@ We can visualize the performance and efficency differences of each model with a :okwarning: fig = model.plot() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_battery_auto_ml_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_battery_auto_ml_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_auto_ml_plot.html + :file: SPHINX_DIRECTORY/figures/examples_battery_auto_ml_plot.html .. ipython:: python @@ -721,12 +721,12 @@ We can now define the model using those hyperparameters and train it. :suppress: res = model_rf.regression_report() - html_file = open("/project/data/VerticaPy/docs/figures/examples_battery_reg_reprot.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_battery_reg_reprot.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_reg_reprot.html + :file: SPHINX_DIRECTORY/figures/examples_battery_reg_reprot.html The predictive power of our model looks pretty good. Let's use our model to predict the SoH of the battery. 
We can visualize our prediction with a plot against the true values. @@ -754,10 +754,10 @@ The predictive power of our model looks pretty good. Let's use our model to pred ts = "start_time", columns = ["SOH", "SOH_estimates"], ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_battery_auto_ml_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_battery_auto_ml_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_battery_auto_ml_plot.html + :file: SPHINX_DIRECTORY/figures/examples_battery_auto_ml_plot.html Conclusion ----------- diff --git a/docs/source/examples_business_booking.rst b/docs/source/examples_business_booking.rst index 531589f15..638f148c5 100644 --- a/docs/source/examples_business_booking.rst +++ b/docs/source/examples_business_booking.rst @@ -59,14 +59,14 @@ Let's create a Virtual DataFrame of the dataset. .. ipython:: python :suppress: - expedia = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/booking/expedia.csv") + expedia = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/booking/expedia.csv") res = expedia.head(5) - html_file = open("/project/data/VerticaPy/docs/figures/examples_expedia_table_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_expedia_table_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_table_head.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_table_head.html .. warning:: @@ -97,12 +97,12 @@ We start by using the :py:func:`~verticapy.vDataFrame.sessionize` method to crea session_threshold = "30 minutes", name = "session_id", ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_expedia_sessionize.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_expedia_sessionize.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_sessionize.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_sessionize.html The duration of the trip should also influence/be indicative of the user's behavior on the site, so we'll take that into account. @@ -138,12 +138,12 @@ If a user looks at the same hotel several times, then it might mean that they're name = "mode_hotel_cluster", add_count = True, ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_expedia_analytic.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_expedia_analytic.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_analytic.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_analytic.html We can now aggregate the session and get some useful statistics out of it: - **end_session_date_time:** Date and time when the session ends. @@ -186,12 +186,12 @@ Let's look at the missing values. :suppress: res = expedia.count_percent() - html_file = open("/project/data/VerticaPy/docs/figures/examples_expedia_count_percent.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_expedia_count_percent.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_count_percent.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_count_percent.html Let's impute the missing values for 'avg_distance' and 'trip_duration'. 
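Mean imputation replaces every missing value of a column with the average of its non-missing values. A minimal, self-contained sketch of the idea (VerticaPy runs the equivalent computation in-database):

.. code-block:: python

    # Sketch of mean imputation on a small in-memory sample.
    values = [3.2, None, 5.0, None, 4.4]
    observed = [v for v in values if v is not None]
    avg = sum(observed) / len(observed)
    imputed = [avg if v is None else v for v in values]
    # imputed == [3.2, 4.2, 5.0, 4.2, 4.4]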
@@ -205,12 +205,12 @@ Let's impute the missing values for 'avg_distance' and 'trip_duration'. expedia["avg_distance" ].fillna(method = "avg") res = expedia["trip_duration"].fillna(method = "avg") - html_file = open("/project/data/VerticaPy/docs/figures/examples_expedia_fillna_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_expedia_fillna_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_fillna_1.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_fillna_1.html We can then look at the links between the variables. We will use Spearman's rank correleation coefficient to get all the monotonic relationships. @@ -224,10 +224,10 @@ We can then look at the links between the variables. We will use Spearman's rank import verticapy verticapy.set_option("plotting_lib", "plotly") fig = expedia.corr(method = "spearman") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_expedia_corr.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_expedia_corr.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_corr.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_corr.html We can see huge links between some of the variables ('mode_hotel_cluster_count' and 'session_duration') and our response variable ('is_booking'). A logistic regression would work well in this case because the response and predictors have a monotonic relationship. @@ -268,10 +268,10 @@ None of our coefficients are rejected (``pvalue = 0``). Let's look at their impo :suppress: fig = model_logit.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_expedia_features_importance.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_expedia_features_importance.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_features_importance.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_features_importance.html It looks like there are two main predictors: 'mode_hotel_cluster_count' and 'trip_duration'. According to our model, users likely to make a booking during a particular session will tend to: @@ -297,12 +297,12 @@ Let's add our prediction to the :py:mod:`~verticapy.vDataFrame`. name = "booking_prob_logit", pos_label = 1, ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_expedia_predict_proba_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_expedia_predict_proba_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_predict_proba_1.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_predict_proba_1.html While analyzing the following boxplot (prediction partitioned by 'is_booking'), we can notice that the ``cutoff`` is around 0.22 because most of the positive predictions have a probability between 0.23 and 0.5. Most of the negative predictions are between 0.05 and 0.2. @@ -315,10 +315,10 @@ While analyzing the following boxplot (prediction partitioned by 'is_booking'), :okwarning: fig = expedia["booking_prob_logit"].boxplot(by = "is_booking") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_expedia_predict_boxplot_1.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_expedia_predict_boxplot_1.html") .. 
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_predict_boxplot_1.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_predict_boxplot_1.html Let's confirm our hypothesis by computing the best ``cutoff``. @@ -336,12 +336,12 @@ Let's look at the efficiency of our model with a cutoff of 0.22. :suppress: res = model_logit.report(cutoff = 0.22) - html_file = open("/project/data/VerticaPy/docs/figures/examples_expedia_cutoff_best.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_expedia_cutoff_best.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_cutoff_best.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_cutoff_best.html ROC Curve: +++++++++++ @@ -354,10 +354,10 @@ ROC Curve: :suppress: fig = model_logit.roc_curve() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_expedia_roc_curve_1.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_expedia_roc_curve_1.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_expedia_roc_curve_1.html + :file: SPHINX_DIRECTORY/figures/examples_expedia_roc_curve_1.html We're left with an excellent model. With this, we can predict whether a user will book a hotel during a specific session and make adjustments to our site accordingly. For example, to influence a user to make a booking, we could propose new hotels. diff --git a/docs/source/examples_business_churn.rst b/docs/source/examples_business_churn.rst index b75af1afc..4eaa29099 100644 --- a/docs/source/examples_business_churn.rst +++ b/docs/source/examples_business_churn.rst @@ -47,15 +47,15 @@ Let's take a look at the first few entries in the dataset. :suppress: churn = vp.read_csv( - "/project/data/VerticaPy/docs/source/_static/website/examples/data/churn/customers.csv", + "SPHINX_DIRECTORY/source/_static/website/examples/data/churn/customers.csv", ) res = churn.head(10) - html_file = open("/project/data/VerticaPy/docs/figures/examples_churn_table.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_churn_table.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_churn_table.html + :file: SPHINX_DIRECTORY/figures/examples_churn_table.html Data Exploration and Preparation --------------------------------- @@ -70,12 +70,12 @@ Let's examine our data. :suppress: res = churn.describe(method = "categorical", unique = True) - html_file = open("/project/data/VerticaPy/docs/figures/examples_churn_table_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_churn_table_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_churn_table_describe.html + :file: SPHINX_DIRECTORY/figures/examples_churn_table_describe.html Several variables are categorical, and since they all have low cardinalities, we can compute their dummies. We can also convert all booleans to numeric. @@ -134,12 +134,12 @@ Several variables are categorical, and since they all have low cardinalities, we "InternetService", ], ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_insurance_table_clean_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_insurance_table_clean_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. 
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_insurance_table_clean_1.html + :file: SPHINX_DIRECTORY/figures/examples_insurance_table_clean_1.html Let's compute the correlations between the different variables and the response column. @@ -154,10 +154,10 @@ Let's compute the correlations between the different variables and the response import verticapy verticapy.set_option("plotting_lib", "plotly") fig = churn.corr(focus = "Churn") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_churn_corr.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_churn_corr.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_churn_corr.html + :file: SPHINX_DIRECTORY/figures/examples_churn_corr.html Many features have a strong correlation with the 'Churn' variable. For example, the customers that have a 'Month to Month' contract are more likely to churn. Having this type of contract gives customers a lot of flexibility and allows them to leave at any time. New customers are also likely to churn. @@ -173,10 +173,10 @@ Many features have a strong correlation with the 'Churn' variable. For example, import verticapy verticapy.set_option("plotting_lib", "plotly") fig = churn.barh(["Contract_Month-to-month", "tenure"], method = "avg", of = "Churn", height = 500) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_churn_barh.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_churn_barh.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_churn_barh.html + :file: SPHINX_DIRECTORY/figures/examples_churn_barh.html The following scatter plot shows that providing better tariff plans can prevent churning. Indeed, customers having high total charges are more likely to churn even if they've been with the company for a long time. @@ -191,10 +191,10 @@ The following scatter plot shows that providing better tariff plans can prevent import verticapy verticapy.set_option("plotting_lib", "plotly") fig = churn.scatter(["TotalCharges", "tenure"], by = "Churn") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_churn_scatter.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_churn_scatter.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_churn_scatter.html + :file: SPHINX_DIRECTORY/figures/examples_churn_scatter.html Let's move on to machine learning. @@ -251,12 +251,12 @@ Let's train and evaluate our model. test, ) res = model.classification_report() - html_file = open("/project/data/VerticaPy/docs/figures/examples_churn_table_report.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_churn_table_report.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_churn_table_report.html + :file: SPHINX_DIRECTORY/figures/examples_churn_table_report.html The model is excellent! Let's run some machine learning on the entire dataset and compute the importance of each feature. @@ -282,10 +282,10 @@ The model is excellent! Let's run some machine learning on the entire dataset an "churn", ) fig = model.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_churn_features_importance.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_churn_features_importance.html") .. 
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_churn_features_importance.html + :file: SPHINX_DIRECTORY/figures/examples_churn_features_importance.html Based on our model, most churning customers are at least one of the following: @@ -326,12 +326,12 @@ Notice that customers have a 'Fiber Optic' option are also likely to churn. Let' fun.avg(churn["monthlycharges"])._as("monthlycharges"), ] ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_churn_table_groupby.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_churn_table_groupby.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_churn_table_groupby.html + :file: SPHINX_DIRECTORY/figures/examples_churn_table_groupby.html It seems like the 'Fiber Optic' option in and of itself doesn't lead to churning, but customers that have this option tend to churn because their contract puts them into one of the three categories we listed before: they're paying more. @@ -349,10 +349,10 @@ We'll use a lift chart to help us identify which of our customers are likely to import verticapy verticapy.set_option("plotting_lib", "plotly") fig = model.lift_chart() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_churn_lift_chart.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_churn_lift_chart.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_churn_lift_chart.html + :file: SPHINX_DIRECTORY/figures/examples_churn_lift_chart.html By targeting less than 30% of the entire distribution, our predictions will be more than three times more accurate than the other 70%. diff --git a/docs/source/examples_business_credit_card_fraud.rst b/docs/source/examples_business_credit_card_fraud.rst index 437d1379f..3d504dad9 100644 --- a/docs/source/examples_business_credit_card_fraud.rst +++ b/docs/source/examples_business_credit_card_fraud.rst @@ -52,14 +52,14 @@ Let's create a Virtual DataFrame of the dataset. .. ipython:: python :suppress: - creditcard = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/credit_card_fraud/creditcard.csv") + creditcard = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/credit_card_fraud/creditcard.csv") res = creditcard.head(5) - html_file = open("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_table_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_creditcardfraud_table_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_table_head.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_table_head.html .. warning:: @@ -78,12 +78,12 @@ Let's explore the data by displaying descriptive statistics of all the columns. :suppress: res = creditcard.describe() - html_file = open("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_creditcardfraud_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_describe.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_describe.html It'll be difficult to work on the principal components (V1 through V28) without knowing what they mean. The only features we can work on are 'Time' and 'Amount'. 
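Since 'Time' stores the number of seconds elapsed since the first transaction, converting it to a timestamp is just an offset from the dataset's start date. A sketch of the same arithmetic in plain Python:

.. code-block:: python

    from datetime import datetime, timedelta

    # Sketch: seconds elapsed since the start of the dataset -> timestamp.
    # The dataset is known to start on 2013-09-01.
    start = datetime(2013, 9, 1)
    print(start + timedelta(seconds = 406))  # 2013-09-01 00:06:46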
@@ -97,12 +97,12 @@ Let's convert the number of seconds elapsed to the correct date and time. We kno :suppress: res = creditcard["Time"].apply("TIMESTAMPADD(second, {}::int, '2013-09-01 00:00:00'::timestamp)") - html_file = open("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_apply.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_creditcardfraud_apply.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_apply.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_apply.html When performing machine learning, we'll take the data from two days and split it into a training set (first day) and a test set (second day). @@ -114,12 +114,12 @@ When performing machine learning, we'll take the data from two days and split it :suppress: res = creditcard["Time"].describe() - html_file = open("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_describe_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_creditcardfraud_describe_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_describe_2.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_describe_2.html Fraudulent activity probably isn't uniform across all hours of the day, so we'll extract the hour from the time and see how that influences the prediction. @@ -137,12 +137,12 @@ Fraudulent activity probably isn't uniform across all hours of the day, so we'll creditcard["hour"] = fun.hour(creditcard["Time"]) res = creditcard[["Time", "hour"]] - html_file = open("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_sample_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_creditcardfraud_sample_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_sample_1.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_sample_1.html We can visualize the frequency of fraudulent transactions throughout the day with a histogram. @@ -157,10 +157,10 @@ We can visualize the frequency of fraudulent transactions throughout the day wit import verticapy verticapy.set_option("plotting_lib", "plotly") fig = creditcard["hour"].hist(method = "avg", of = "Class") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_hist.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_creditcardfraud_hist.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_hist.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_hist.html It seems like most fraudulent activity happens at night. @@ -181,10 +181,10 @@ The transaction amount also likely differs between fraudulent and genuine transa method = "avg", of = "Amount", ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_bar.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_creditcardfraud_bar.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_bar.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_bar.html Let's create some new features and move forward from there. 
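These new features are moving aggregates: for each transaction, we look back over a window of preceding rows and aggregate them. A minimal sketch of a rolling sum over the three preceding rows, on a hypothetical list of amounts (``vDataFrame.rolling`` computes the equivalent in-database):

.. code-block:: python

    # Sketch: rolling sum over the 3 rows preceding each row.
    amounts = [12.0, 7.5, 30.0, 2.5, 99.0]
    window = 3
    rolling_sum = [sum(amounts[max(0, i - window):i]) for i in range(len(amounts))]
    # rolling_sum == [0, 12.0, 19.5, 49.5, 40.0]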
@@ -250,12 +250,12 @@ In lieu of customer IDs, we'll aggregate on the transaction amount over some par by = ["Amount"], order_by = ["Time"], ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_rolling.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_creditcardfraud_rolling.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_rolling.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_rolling.html As an aside, we could also create some features that represent different parts of the day, but won't be useful for our use case since we're only working with data for two days' worth of data. @@ -270,10 +270,10 @@ Let's look at the correlation matrix and see which features influence our predic :okwarning: fig = creditcard.corr(width = 800) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_corr_2.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_creditcardfraud_corr_2.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_corr_2.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_corr_2.html Our new features aren't linearly correlated with our response, but some of the components seem to have a large influence on our prediction. We'll use these when we create our model. @@ -303,12 +303,12 @@ To simplify things, let's save the dataset into a new table. relation_type = "table", inplace = True, ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_to_db.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_creditcardfraud_to_db.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_to_db.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_to_db.html Data Modeling -------------- @@ -362,12 +362,12 @@ Supervising would make this pretty easy since it would just be a binary classifi ) model.fit(train, predictors, response, test) res = model.classification_report() - html_file = open("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_classification_report.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_creditcardfraud_classification_report.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_classification_report.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_classification_report.html Based on the report, our model is very good at detecting non-fraudulent events; the AUC is high and the PRC AUC is very good. We can use this model to filter obvious events and to get some insight on the importance of each feature. @@ -380,10 +380,10 @@ Based on the report, our model is very good at detecting non-fraudulent events; :okwarning: fig = model.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_features_importance_1.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_creditcardfraud_features_importance_1.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_features_importance_1.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_features_importance_1.html Some PCA components seem to be very relevant and will be essential for finding anomalies. 
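Keep in mind how a ``Z-score`` flags a global outlier: a value is suspicious when its distance from the mean, measured in standard deviations, exceeds a threshold. A minimal sketch (the threshold of 2 only suits this toy sample; the example below uses 5.0 on the full dataset):

.. code-block:: python

    import statistics

    # Sketch: flag values more than 2 standard deviations from the mean.
    values = [0.1, -0.3, 0.2, 0.0, -0.1, 0.3, -12.4]
    mean, std = statistics.mean(values), statistics.stdev(values)
    outliers = [v for v in values if abs(v - mean) / std > 2]
    # outliers == [-12.4]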
@@ -423,10 +423,10 @@ Before using these techniques, let's draw some scatter plots to get a better ide
        by = "Class",
        max_nb_points = 5000000,
    )
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_ml_scatter_1.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_creditcardfraud_ml_scatter_1.html")

.. raw:: html
-    :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_ml_scatter_1.html
+    :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_ml_scatter_1.html

.. code-block:: python

.. ipython:: python
    :suppress:
    :okwarning:

    fig = creditcard.scatter(
        ["V12", "V17", "V10"],
        by = "Class",
    )
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_ml_scatter_2.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_creditcardfraud_ml_scatter_2.html")

.. raw:: html
-    :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_ml_scatter_2.html
+    :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_ml_scatter_2.html

In this case, the anomalies seem to be pretty clear global outliers of the distributions. When doing unsupervised learning, we don't have this information in advance.

@@ -488,10 +488,10 @@ Once we deploy the unsupervised model and can reliably detect suspicious transac
        ["V12", "V17", "V10", "V14", "V16"],
        n_cluster = [1, 2, 10, 20, 30],
    )
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_ml_elbow.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_creditcardfraud_ml_elbow.html")

.. raw:: html
-    :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_ml_elbow.html
+    :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_ml_elbow.html

10 seems to be a suitable number of clusters, so let's try out 20 clusters and see if the collective outliers cluster together. We can then evaluate each cluster independently and see which clusters have the most anomalies.

@@ -528,12 +528,12 @@ Let's direct our attention to the smallest clusters.
            "SUM(Class) / 492 AS total_fraud",
        ],
    ).sort("total")
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_groupby_ml.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_creditcardfraud_groupby_ml.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-    :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_groupby_ml.html
+    :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_groupby_ml.html

Notice that clusters with fewer elements tend to contain many more fraudulent events than the others. This methodology makes ``k-means`` a good algorithm for catching collective outliers. Combining ``k-means`` with other techniques like z-score, we can find most of the outliers of the distribution.

@@ -574,12 +574,12 @@ Let's use the ``Z-score`` to detect global outliers of the distribution.
            "SUM(Class) / 492 AS total_fraud",
        ],
    ).sort("total")
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_groupby_2_ml.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_creditcardfraud_groupby_2_ml.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-    :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_groupby_2_ml.html
+    :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_groupby_2_ml.html

.. code-block:: python
["V12", "V17",], threshold = 5.0, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_ml_outliers_plot_3.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_creditcardfraud_ml_outliers_plot_3.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_ml_outliers_plot_3.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_ml_outliers_plot_3.html We can see that we can caught more than 71% of the fraudulent activity in less than 1% of the dataset. @@ -632,10 +632,10 @@ Other algorithms could be used to solve the problem with more precision if we co lof_creditcard = model.predict() lof_creditcard["outliers"] = "(CASE WHEN lof_score > 2 THEN 1 ELSE 0 END)" fig = lof_creditcard.scatter(["V12", "V17", "V10"], by = "outliers") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_creditcardfraud_ml_lof_plot_1.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_creditcardfraud_ml_lof_plot_1.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_creditcardfraud_ml_lof_plot_1.html + :file: SPHINX_DIRECTORY/figures/examples_creditcardfraud_ml_lof_plot_1.html We can catch outliers with a neighbors score. Again, the main problem with these sorts of algorithms is that what they have in precision, they lack in speed, which makes them unsuitable for scoring new data. This is why it's important to focus on scalable techniques like ``k-means``. diff --git a/docs/source/examples_business_football.rst b/docs/source/examples_business_football.rst index 5f7f97d45..0d5a224b7 100644 --- a/docs/source/examples_business_football.rst +++ b/docs/source/examples_business_football.rst @@ -46,14 +46,14 @@ Let's create a Virtual DataFrame of the dataset. .. ipython:: python :suppress: - football = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/football/games.csv") + football = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/football/games.csv") res = football.head(5) - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_table_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_table_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_table_head.html + :file: SPHINX_DIRECTORY/figures/examples_football_table_head.html Data Exploration and Preparation --------------------------------- @@ -68,12 +68,12 @@ Let's explore the data by displaying descriptive statistics of all the columns. :suppress: res = football["date"].describe() - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_describe.html + :file: SPHINX_DIRECTORY/figures/examples_football_describe.html The dataset includes a total of 41,586 games, which take place between 1872 and 2020. Let's look at our game types and teams. 
@@ -85,12 +85,12 @@ The dataset includes a total of 41,586 games, which take place between 1872 and
    :suppress:

    res = football["tournament"].describe()
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_describe_2.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_describe_2.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-    :file: /project/data/VerticaPy/docs/figures/examples_football_describe_2.html
+    :file: SPHINX_DIRECTORY/figures/examples_football_describe_2.html

Different types of tournaments took place (FIFA World Cup, UEFA Euro, etc.) and most of the games in our data are friendlies or qualifiers for international tournaments.

@@ -102,12 +102,12 @@ Different types of tournaments took place (FIFA World Cup, UEFA Euro, etc.) and
    :suppress:

    res = football.describe()
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_describe_3.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_describe_3.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-    :file: /project/data/VerticaPy/docs/figures/examples_football_describe_3.html
+    :file: SPHINX_DIRECTORY/figures/examples_football_describe_3.html

.. code-block:: python

    football.describe(method = "categorical")

.. ipython:: python
    :suppress:

    res = football.describe(method = "categorical")
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_describe_4.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_describe_4.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-    :file: /project/data/VerticaPy/docs/figures/examples_football_describe_4.html
+    :file: SPHINX_DIRECTORY/figures/examples_football_describe_4.html

The dataset includes 308 national teams. For most of the games, the home team scores more than the away team. Since some games take place in a neutral location, we can verify this hypothesis using the variable 'neutral'. Notice also that the number of goals per match is pretty low (median of 1 for both away and home teams).

@@ -204,12 +204,12 @@ Let's just consider teams that have played more than five home and away games.
        inplace = True,
    )
    res = football
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_to_db_1.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_to_db_1.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-    :file: /project/data/VerticaPy/docs/figures/examples_football_to_db_1.html
+    :file: SPHINX_DIRECTORY/figures/examples_football_to_db_1.html

A lot of things could influence the outcome of a game. Since we only have access to the score, teams, and type of game, we can't consider external factors like weather or temperature, which would otherwise help our prediction.

@@ -257,12 +257,12 @@ Let's start by creating the feature 'winner' to indicate the winner of a game.
        football["home_score"] < football["away_score"], football["away_team"], None,
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_case_when_1.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_case_when_1.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-    :file: /project/data/VerticaPy/docs/figures/examples_football_case_when_1.html
+    :file: SPHINX_DIRECTORY/figures/examples_football_case_when_1.html

Let's analyze the last game of each tournament.
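The idea is to rank the games of each tournament by date so that rank 1 marks the final. A minimal sketch of that windowing step (the exact partition columns used in the example may differ):

.. code-block:: python

    # Sketch: rank games within each tournament, most recent first,
    # so that order_tournament = 1 identifies the last game.
    football.analytic(
        func = "row_number",
        by = ["tournament"],
        order_by = {"date": "desc"},
        name = "order_tournament",
    )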
@@ -289,12 +289,12 @@ Let's analyze the last game of each tournament. name = "order_tournament", ) res = football - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_analytic_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_analytic_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_analytic_2.html + :file: SPHINX_DIRECTORY/figures/examples_football_analytic_2.html We can filter the data by only considering the last games and top tournaments. @@ -337,12 +337,12 @@ We can filter the data by only considering the last games and top tournaments. ] ) res = football - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_filter_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_filter_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_filter_2.html + :file: SPHINX_DIRECTORY/figures/examples_football_filter_2.html Let's consider the World Cup as a special tournament. It is the only one where the confrontations between the top teams is possible. @@ -361,12 +361,12 @@ Let's consider the World Cup as a special tournament. It is the only one where t 1, 0, ) res = football["Word_Cup"] - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_decode_3.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_decode_3.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_decode_3.html + :file: SPHINX_DIRECTORY/figures/examples_football_decode_3.html We can compute all the number of cup-wins by team. As expected, Brazil and Germany are the top football teams. @@ -398,12 +398,12 @@ We can compute all the number of cup-wins by team. As expected, Brazil and Germa "nb_Continental_Cup": "desc", } ).head(10) - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_groupby_3.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_groupby_3.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_groupby_3.html + :file: SPHINX_DIRECTORY/figures/examples_football_groupby_3.html Let's export the result to our Vertica database. @@ -430,12 +430,12 @@ Let's export the result to our Vertica database. relation_type = "table", ) res = football_cup_winners - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_to_db_4.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_to_db_4.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_to_db_4.html + :file: SPHINX_DIRECTORY/figures/examples_football_to_db_4.html Team Confederations ++++++++++++++++++++ @@ -459,7 +459,7 @@ First let's encode the different continents so we can compute the correct aggreg .. 
ipython:: python :suppress: - football = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/football/games.csv") + football = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/football/games.csv") res = football.case_when( 'confederation', football["tournament"] == 'UEFA Euro qualification', 5, @@ -468,12 +468,12 @@ First let's encode the different continents so we can compute the correct aggreg football["tournament"] == 'Copa América', 2, football["tournament"] == 'Gold Cup', 1, 0, ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_table_confederation_case_when.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_table_confederation_case_when.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_table_confederation_case_when.html + :file: SPHINX_DIRECTORY/figures/examples_football_table_confederation_case_when.html We can aggregate the data and get each team's continent. @@ -493,12 +493,12 @@ We can aggregate the data and get each team's continent. [fun.max(football["confederation"])._as("confederation")], ) res = confederation.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_confederation_6.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_confederation_6.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_confederation_6.html + :file: SPHINX_DIRECTORY/figures/examples_football_confederation_6.html We can decode the previous label encoding. @@ -524,12 +524,12 @@ We can decode the previous label encoding. 1, "CONCACAF", "OFC", ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_confederation_8.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_confederation_8.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_confederation_8.html + :file: SPHINX_DIRECTORY/figures/examples_football_confederation_8.html Let's export the result to our Vertica database. @@ -552,12 +552,12 @@ Let's export the result to our Vertica database. relation_type = "table", ) res = confederation - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_confederation_9.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_confederation_9.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_confederation_9.html + :file: SPHINX_DIRECTORY/figures/examples_football_confederation_9.html Team KPIs ++++++++++ @@ -608,12 +608,12 @@ We use just two variables to track teams: away_team and home_team. This makes it # Merging the 2 interverted datasets all_matchs = football.append(football2) res = all_matchs["neutral"].rename("home_team_id") - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_10.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_10.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_clean_10.html + :file: SPHINX_DIRECTORY/figures/examples_football_clean_10.html To compute the different aggregations, we need to add dummies which indicate the type of game and winner. 
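A quick sketch may help clarify the swap-and-append pattern above: by duplicating the games with the home and away sides exchanged, every game appears once from each team's point of view, so per-team aggregations become straightforward. The sketch below is illustrative only and uses the ``copy``, ``rename``, and ``append`` calls already seen in these examples:

.. code-block:: python

    # Illustrative sketch of the swap-and-append pattern (column names mirror the ones above).
    # Orientation 1: the home side plays the role of team1.
    football1 = football.copy()
    football1["home_team"].rename("team1")
    football1["away_team"].rename("team2")
    football1["home_score"].rename("team1_score")
    football1["away_score"].rename("team2_score")

    # Orientation 2: the away side plays the role of team1.
    football2 = football.copy()
    football2["away_team"].rename("team1")
    football2["home_team"].rename("team2")
    football2["away_score"].rename("team1_score")
    football2["home_score"].rename("team2_score")

    # Every game now appears twice, once per team's point of view.
    all_matchs = football1.append(football2)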
@@ -665,12 +665,12 @@ To compute the different aggregations, we need to add dummies which indicate the all_matchs["Victory_team1"].astype("int") all_matchs["Draw"] = (all_matchs["team1_score"] == all_matchs["team2_score"]) res = all_matchs["Draw"].astype("int") - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_11.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_11.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_clean_11.html + :file: SPHINX_DIRECTORY/figures/examples_football_clean_11.html Now we can compute each team's KPI. @@ -712,12 +712,12 @@ Now we can compute each team's KPI. ], ).sort({"Number_Games_World_Tournament": "desc"}) res = teams_kpi.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_12.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_12.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_clean_12.html + :file: SPHINX_DIRECTORY/figures/examples_football_clean_12.html We can join the different information about the cup winners to enrich our dataset. We'll be using this later, so let's export it to our Vertica database. @@ -749,12 +749,12 @@ We can join the different information about the cup winners to enrich our datase ], ).to_db("teams_kpi", relation_type = "table") res = teams_kpi.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_final.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_final.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_final.html + :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_final.html Let's add each team's confederation to our dataset. @@ -778,12 +778,12 @@ Let's add each team's confederation to our dataset. expr2 = ["confederation"], ) res = teams_kpi.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_final_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_final_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_final_1.html + :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_final_1.html Since clustering will use different statistics, we need to normalize the data. We'll also create a dummy that will equal 1 if the team won at least one World Cup. @@ -813,12 +813,12 @@ Since clustering will use different statistics, we need to normalize the data. W ) teams_kpi["Word_Cup_Victory"] = teams_kpi["nb_World_Cup"] > 0 res = teams_kpi["Word_Cup_Victory"].astype("int") - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_final_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_final_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_final_2.html + :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_final_2.html Some data is missing; this is because only top teams won major tournaments. Besides, some non-professional teams may not have a stadium. 
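For reference, the min-max normalization used in these examples rescales a column x into the [0, 1] interval:

.. math::

    x' = \frac{x - \min(x)}{\max(x) - \min(x)}

The strongest value of a KPI therefore maps to 1 and the weakest to 0, which puts all the KPIs on a comparable scale before clustering.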
@@ -830,12 +830,12 @@ Some data is missing; this is because only top teams won major tournaments. Besi
    :suppress:

    res = teams_kpi.count()
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_final_3.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_final_3.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_final_3.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_final_3.html

Let's impute the missing values with 0.

@@ -869,12 +869,12 @@ Let's impute the missing values with 0.
            "confederation": "OFC",
        },
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_final_4.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_final_4.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_final_4.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_final_4.html

Let's export the result to our Vertica database.

@@ -897,12 +897,12 @@ Let's export the result to our Vertica database.
        inplace = True,
    )
    res = teams_kpi
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_football_clustering_1.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_football_clustering_1.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_football_clustering_1.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_football_clustering_1.html

Team Rankings with k-means
---------------------------
@@ -952,10 +952,10 @@ To compute a ``k-means`` model, we need to find a value for 'k'. Let's draw an :
        predictors,
        n_cluster = (1, 11),
    )
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_football_elbow_1.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_football_elbow_1.html")

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_elbow_1.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_elbow_1.html

Six seems to be a good number of clusters. To help the algorithm converge to meaningful clusters, we can initialize them with different centroid levels. For example, we can associate very good teams (champions) with World Cup winners, good teams with continental cup winners, etc. This will let us properly weigh the performance of each team relative to the strength of their region.

@@ -995,12 +995,12 @@ Let's add the prediction to the :py:mod:`~verticapy.vDataFrame`.
        teams_kpi,
        name = "fifa_rank",
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_model_kmeans_1.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_model_kmeans_1.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_model_kmeans_1.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_model_kmeans_1.html

Let's look at the strongest group, which includes well-known teams like Argentina, Brazil, and France.
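Before we inspect the groups, one note for readers who want to reproduce the seeded initialization described above: VerticaPy's ``KMeans`` accepts a list of custom initial centers through its ``init`` parameter. The sketch below is illustrative only; the model name, the two predictors, and the seed values are placeholders (one row per cluster, one value per predictor, ordered from champion-like profiles down to weaker ones).

.. code-block:: python

    from verticapy.machine_learning.vertica import KMeans

    # Hypothetical seeds on normalized KPIs, from strongest to weakest profile.
    seeds = [
        [1.0, 1.0],
        [0.8, 0.5],
        [0.6, 0.2],
        [0.4, 0.0],
        [0.2, 0.0],
        [0.0, 0.0],
    ]
    model = KMeans("football.teams_rank_clusters", n_cluster = 6, init = seeds)
    model.fit(teams_kpi, ["Percent_Victory", "nb_World_Cup"])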
@@ -1020,12 +1020,12 @@ Let's look at the strongest group, which includes well-known teams like Argentin usecols = ["team1", "fifa_rank"], order_by = ["fifa_rank"], ).head(10) - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_10.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_10.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_10.html + :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_10.html The weakest group includes less well-known teams. @@ -1045,12 +1045,12 @@ The weakest group includes less well-known teams. usecols = ["team1", "fifa_rank"], order_by = ["fifa_rank"], ).head(10) - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_11.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_11.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_11.html + :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_11.html A bubble plot will let us visualize the differences in strength between each confederation. @@ -1081,10 +1081,10 @@ We can see the strongest group at the top right of the graphic and weakest teams size = "fifa_rank", by = "confederation", ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_football_scatter_1.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_football_scatter_1.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_scatter_1.html + :file: SPHINX_DIRECTORY/figures/examples_football_scatter_1.html We can also look at the Percent of Victory by rank to confirm our hypothesis. @@ -1111,10 +1111,10 @@ We can also look at the Percent of Victory by rank to confirm our hypothesis. size = "Percent_Victory", by = "fifa_rank", ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_football_scatter_2.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_football_scatter_2.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_scatter_2.html + :file: SPHINX_DIRECTORY/figures/examples_football_scatter_2.html A box plot can also show us the differences in skill between teams. We can look at rank 1, where the percent of victory is high because of the confederation. @@ -1129,10 +1129,10 @@ Note that the best team in a weaker confederation might not be particularly stro :okwarning: fig = teams_kpi["Percent_Victory"].boxplot(by = "fifa_rank") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_football_boxplot_2.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_football_boxplot_2.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_football_boxplot_2.html + :file: SPHINX_DIRECTORY/figures/examples_football_boxplot_2.html Let's export the KPIs to our Vertica database. @@ -1161,12 +1161,12 @@ Let's export the KPIs to our Vertica database. inplace = True, ) res = teams_kpi - html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_13.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_13.html", "w") html_file.write(res._repr_html_()) html_file.close() .. 
raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_13.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_13.html

Features Engineering
---------------------
@@ -1263,12 +1263,12 @@ We can add dummies to do aggregations on the different games.
    all_matchs["draw"].astype("int")
    all_matchs["victory_team2"] = all_matchs["team1_score"] < all_matchs["team2_score"]
    res = all_matchs["victory_team2"].astype("int")
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_15.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_15.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_15.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_15.html

Let's use moving windows to compute some additional features.

@@ -1399,12 +1399,12 @@ The teams' performance in their recent games
        order_by = ["date"],
        name = "avg_draw_team2_1_5",
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_16.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_16.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_16.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_16.html

The teams' performance in the last such tournament
+++++++++++++++++++++++++++++++++++++++++++++++++++
@@ -1533,12 +1533,12 @@ The teams' performance in the last such tournament
        order_by = ["date"],
        name = "avg_draw_same_tournament_team2_1_5",
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_17.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_17.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_17.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_17.html

Direct Confrontation
+++++++++++++++++++++
@@ -1603,12 +1603,12 @@ Direct Confrontation
        order_by = ["date"],
        name = "avg_draw_direct_team1_1_5",
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_19.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_19.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_19.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_19.html

Games against opponents with the same rank
++++++++++++++++++++++++++++++++++++++++++++++
@@ -1701,12 +1701,12 @@ Games against opponents with the same rank
        order_by = ["date"],
        name = "avg_draw_rank1_team2_1_5",
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_21.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_21.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

..
raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_21.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_21.html

Games between teams with rank 1 and rank 2
+++++++++++++++++++++++++++++++++++++++++++
@@ -1753,12 +1753,12 @@ Games between teams with rank 1 and rank 2
        order_by = ["date"],
        name = "avg_draw_rank1_rank2_team1_1_5",
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_22.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_22.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_22.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_22.html

Before we use the 'neutral' variable with our model, we should convert it to an integer.

@@ -1784,12 +1784,12 @@ We also need to create our response column: the outcome of the game.
        all_matchs["team1_score"] < all_matchs["team2_score"], "2",
        "X",
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_23.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_23.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_kmeans_23.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_kmeans_23.html

We have some missing values here. This might be because the two teams never played together, the competition was one or both teams' first, etc.

@@ -1801,12 +1801,12 @@ We have some missing values here. This might be because the two teams never play
    :suppress:

    res = all_matchs.count()
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_count_final_1.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_count_final_1.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_count_final_1.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_count_final_1.html

We need to impute these missing values.

@@ -1858,12 +1858,12 @@ We need to impute these missing values.
    all_matchs["avg_victory_same_tournament_team2_1_10"].fillna(expr = "avg_victory_team2_1_10")
    all_matchs["avg_victory_same_tournament_team2_1_3"].fillna(expr = "avg_victory_team2_1_3")
    res = all_matchs["avg_draw_same_tournament_team2_1_5"].fillna(expr = "avg_draw_team2_1_5")
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_all_matchs_final_1.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_all_matchs_final_1.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_all_matchs_final_1.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_all_matchs_final_1.html

Let's export the result to our Vertica database using the variable 'match_sample' to avoid counting the same game twice.
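(A note on the imputation above: ``fillna(expr = ...)`` fills the missing values of a column from another SQL expression, which is the classic ``COALESCE`` pattern.)

.. code-block:: python

    # Equivalent to COALESCE(avg_victory_same_tournament_team2_1_3, avg_victory_team2_1_3).
    all_matchs["avg_victory_same_tournament_team2_1_3"].fillna(expr = "avg_victory_team2_1_3")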
@@ -1900,12 +1900,12 @@ Let's export the result to our Vertica database using the variable 'match_sample
        db_filter = (fun.year(all_matchs["date"]) > 2015) & (all_matchs["match_sample"] == 1),
    )
    res = all_matchs
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_all_matchs_final_2.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_all_matchs_final_2.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_all_matchs_final_2.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_all_matchs_final_2.html

Machine Learning
-----------------
@@ -1955,12 +1955,12 @@ It's time to make predictions about the outcomes of games. We have a lot of vari
    :suppress:

    res = model.classification_report()
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_clean_kpi_ml_1.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_clean_kpi_ml_1.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_clean_kpi_ml_1.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_clean_kpi_ml_1.html

Our model is excellent! 57% accuracy across 3 categories: that's almost twice as good as a random model, which would only be right about a third of the time.

@@ -1978,10 +1978,10 @@ Looking at the importance of each feature, it seems like direct confrontations a
    :suppress:

    fig = model.features_importance()
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_football_features_importance.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_football_features_importance.html")

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_features_importance.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_features_importance.html

Let's add the predictions to the :py:mod:`~verticapy.vDataFrame`.

@@ -2019,12 +2019,12 @@ Draws are pretty rare, so we'll only consider them if a tie was very likely to o
        test["prob_1"] > test["prob_2"], "1",
        test["prob_1"] < test["prob_2"], "2",
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_ml_case_when_1.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_ml_case_when_1.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_ml_case_when_1.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_ml_case_when_1.html

Let's look at our predictions for the 2018 World Cup.

@@ -2062,12 +2062,12 @@ Let's look at our predictions for the 2018 World Cup.
        ],
        order_by = ["date"],
    ).head(128)
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_ml_search_1.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_ml_search_1.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_ml_search_1.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_ml_search_1.html

Fantastic: we built a very efficient model that predicted France would win almost all of its games (except the game against Argentina, which was really hard to predict). In reality, France did indeed win the 2018 World Cup!
@@ -2111,12 +2111,12 @@ Fantastic: we built a very efficient model which predicted that France will win
        ],
        order_by = ["date"],
    ).head(128)
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_football_ml_search_2.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_football_ml_search_2.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_football_ml_search_2.html
+   :file: SPHINX_DIRECTORY/figures/examples_football_ml_search_2.html

Conclusion
-----------
diff --git a/docs/source/examples_business_insurance.rst b/docs/source/examples_business_insurance.rst
index 1c03ec117..cc4949166 100644
--- a/docs/source/examples_business_insurance.rst
+++ b/docs/source/examples_business_insurance.rst
@@ -58,16 +58,16 @@ Let's take a look at the first few entries in the dataset.
    vp.drop("insurance", method="schema")
    vp.create_schema("insurance")
    data = vp.read_csv(
-        "/project/data/VerticaPy/docs/source/_static/website/examples/data/insurance/insurance.csv",
+        "SPHINX_DIRECTORY/source/_static/website/examples/data/insurance/insurance.csv",
        schema = "insurance",
    )
    res = data.head(5)
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_insurance_table.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_insurance_table.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_table.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_table.html

Data Exploration
-----------------
@@ -82,12 +82,12 @@ Let's check our dataset for missing values. If we find any, we'll have to impute
    :suppress:

    res = data.count_percent()
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_insurance_table_count.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_insurance_table_count.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_table_count.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_table_count.html

There aren't any missing values, so let's get a summary of the features.

@@ -99,12 +99,12 @@ There aren't any missing values, so let's get a summary of the features.
    :suppress:

    res = data.describe(method = "all")
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_insurance_table_describe.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_insurance_table_describe.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_table_describe.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_table_describe.html

The dataset covers 1338 individuals up to age 64 from four different regions, each with up to six dependent children.

@@ -120,10 +120,10 @@ We might find some interesting patterns if we check age distribution, so let's c
    import verticapy
    verticapy.set_option("plotting_lib", "plotly")
    fig = data["age"].hist(method = "count", h = 1)
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_insurance_hist_age.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_insurance_hist_age.html")

..
raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_hist_age.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_hist_age.html

We have a pretty obvious trend here: the 18 and 19 year old age groups are significantly more frequent than any of the older age groups, each of which contains roughly 20 to 30 people.

@@ -138,12 +138,12 @@ Before we do anything else, let's discretize the age column using equal-width bi

    data["age"].discretize(method = "same_width", h = 5)
    res = data
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_insurance_descretize.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_insurance_descretize.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_descretize.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_descretize.html


Age probably influences one's body mass index (BMI), so let's compare the average of
@@ -165,10 +165,10 @@ body mass indexes of each age group and look for patterns there. We'll use a bar
        method = "mean",
        of = "bmi",
    )
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_insurance_bar_age.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_insurance_bar_age.html")

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_bar_age.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_bar_age.html

There's a pretty clear trend here, and we can say that, in general, older individuals tend to have a greater BMI.

@@ -201,10 +201,10 @@ Now we can plot the average number of smokers for each age group.
        method = "mean",
        of = "smoker_int",
    )
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_insurance_bar_age_smoker.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_insurance_bar_age_smoker.html")

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_bar_age_smoker.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_bar_age_smoker.html

Unfortunately, there's no obvious relationship between age and smoking habits, at least none that we can find from this graph.

@@ -228,10 +228,10 @@ Let's see if we can relate an individual's smoking habits with their sex.
        method = "mean",
        of = "smoker_int",
    )
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_insurance_bar_sex_smoker.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_insurance_bar_sex_smoker.html")

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_bar_sex_smoker.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_bar_sex_smoker.html

Now we're getting somewhere! Looks like we have noticeably more male smokers than female ones.

@@ -255,10 +255,10 @@ Let's see how an individual's BMI relates to their sex.
        method = "mean",
        of = "bmi",
    )
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_insurance_bar_sex_bmi.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_insurance_bar_sex_bmi.html")

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_bar_sex_bmi.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_bar_sex_bmi.html

Males seem to have a slightly higher BMI, but it'd be hard to draw any conclusions from such a small difference.

@@ -275,10 +275,10 @@ patterns we identified earlier skews toward one of the sexes.
import verticapy
    verticapy.set_option("plotting_lib", "plotly")
    fig = data.pivot_table(["age", "sex"])
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_insurance_corr_age_sex.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_insurance_corr_age_sex.html")

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_corr_age_sex.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_corr_age_sex.html

It seems that sex is pretty evenly distributed in each age group.

@@ -292,10 +292,10 @@ Let's move on to costs: how much do people tend to spend on medical treatments?
    :suppress:

    fig = data["charges"].hist(method = "count")
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_insurance_charges_hist.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_insurance_charges_hist.html")

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_charges_hist.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_charges_hist.html

Based on this graph, the majority of insurance holders tend to spend less than 1500, and only a handful of people spend more than 5000.

@@ -329,12 +329,12 @@ Remember, we label-encoded 'smoker' from boolean. Let's label-encode some other
    # encoding age
    data["age"].label_encode()
    res = data
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_insurance_table_encoded_new.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_insurance_table_encoded_new.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_table_encoded_new.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_table_encoded_new.html

Before going further, let's check the correlation of the variables with the predictor 'charges'.

@@ -348,10 +348,10 @@ Before going further, let's check the correlation of the variables with the pred
    import verticapy
    verticapy.set_option("plotting_lib", "plotly")
    fig = data.corr(focus = "charges")
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_insurance_charges_focus.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_insurance_charges_focus.html")

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_charges_focus.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_charges_focus.html

.. code-block:: python

@@ -400,12 +400,12 @@ We can create a regression report to check our model's performance.
    :okwarning:

    res = rf_model.report()
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_insurance_table_report.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_insurance_table_report.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_table_report.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_table_report.html

The results seem to be quite good! We have an explained variance of around 0.8. Let's plot the predicted values and compare them to the real ones.

@@ -439,10 +439,10 @@ Let's plot the predicted values and compare them to the real ones.
        ts = "id",
        columns = ["charges", "pred_charges"]
    )
-    fig.write_html("/project/data/VerticaPy/docs/figures/examples_insurance_rf_plot.html")
+    fig.write_html("SPHINX_DIRECTORY/figures/examples_insurance_rf_plot.html")

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_insurance_rf_plot.html
+   :file: SPHINX_DIRECTORY/figures/examples_insurance_rf_plot.html

..
code-block:: python @@ -463,10 +463,10 @@ Ours is a random forest model, so we can use the built-in Vertica function ``RF_ verticapy.set_option("plotting_lib", "plotly") # feature importance for our random forest model fig = rf_model.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_insurance_rf_feature_importance.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_insurance_rf_feature_importance.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_insurance_rf_feature_importance.html + :file: SPHINX_DIRECTORY/figures/examples_insurance_rf_feature_importance.html .. code-block:: python @@ -480,12 +480,12 @@ Ours is a random forest model, so we can use the built-in Vertica function ``RF_ :suppress: res = rf_model.features_importance(show = False) - html_file = open("/project/data/VerticaPy/docs/figures/examples_insurance_table_feature_importance_rf.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_insurance_table_feature_importance_rf.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_insurance_table_feature_importance_rf.html + :file: SPHINX_DIRECTORY/figures/examples_insurance_table_feature_importance_rf.html We can examine how our model works by visualizing one of the trees in our ``Random Forest``. @@ -549,12 +549,12 @@ information criterion (BIC) as a selection criteria. X = ["age","sex", "bmi", "children", "smoker", "region"], y = "charges", ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_insurance_lr_stepwise.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_insurance_lr_stepwise.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_insurance_lr_stepwise.html + :file: SPHINX_DIRECTORY/figures/examples_insurance_lr_stepwise.html From here we see that, again, the same features have similarly significant effects on medical costs. diff --git a/docs/source/examples_business_movies.rst b/docs/source/examples_business_movies.rst index dae9b6da5..f304508be 100644 --- a/docs/source/examples_business_movies.rst +++ b/docs/source/examples_business_movies.rst @@ -59,14 +59,14 @@ Let's take a look at the first few entries in the dataset. vp.drop("movies", method="schema") vp.create_schema("movies") - filmtv_movies = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/movies/movies.csv", schema = "movies") + filmtv_movies = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/movies/movies.csv", schema = "movies") res = filmtv_movies.head(5) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_table.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_table.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_table.html + :file: SPHINX_DIRECTORY/figures/examples_movies_table.html Data Exploration and Preparation --------------------------------- @@ -83,12 +83,12 @@ First, let's explore the dataset. :suppress: res = filmtv_movies.describe(method = "categorical", unique = True) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_describe_cat.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_describe_cat.html", "w") html_file.write(res._repr_html_()) html_file.close() .. 
raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_movies_describe_cat.html
+   :file: SPHINX_DIRECTORY/figures/examples_movies_describe_cat.html

We can drop the 'description' and 'notes' columns since these fields are empty for most of our dataset.

@@ -101,12 +101,12 @@ We can drop the 'description' and 'notes' columns since these fields are empty f

    filmtv_movies.drop(["description", "notes"])
    res = filmtv_movies
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_drop.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_movies_drop.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_movies_drop.html
+   :file: SPHINX_DIRECTORY/figures/examples_movies_drop.html

We have access to more than 50000 movies in 27 different genres. Let's organize our list by average rating.

@@ -119,12 +119,12 @@ We have access to more than 50000 movies in 27 different genres. Let's organize

    filmtv_movies.sort({"avg_vote" : "desc"})
    res = filmtv_movies.sort({"avg_vote" : "desc"})
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_avg_vote_sort.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_movies_avg_vote_sort.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_movies_avg_vote_sort.html
+   :file: SPHINX_DIRECTORY/figures/examples_movies_avg_vote_sort.html

Since we want properly averaged scores, let's just consider the top 10 movies that have at least 10 votes.

@@ -142,12 +142,12 @@ Since we want properly averaged scores, let's just consider the top 10 movies th
        conditions = [filmtv_movies["votes"] > 10],
        order_by = {"avg_vote" : "desc" },
    )
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_search_votes.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_movies_search_votes.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_movies_search_votes.html
+   :file: SPHINX_DIRECTORY/figures/examples_movies_search_votes.html

We can see classic movies like 'The Godfather' and 'Greed'. Let's smooth the avg_vote using a linear regression to make it more representative.

@@ -176,7 +176,7 @@ We can extract the five main actors for each movie with regular expressions.
    :suppress:

    for i in range(1, 5):
-        filmtv_movies2 = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/movies/movies.csv")
+        filmtv_movies2 = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/movies/movies.csv")
        filmtv_movies2.regexp(
            column = "actors",
            method = "substr",
@@ -189,12 +189,12 @@ We can extract the five main actors for each movie with regular expressions.
        else:
            filmtv_movies = filmtv_movies.append(filmtv_movies2)
    res = filmtv_movies["actor"].describe()
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_describe_actors.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_movies_describe_actors.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examplexamples_movies_describe_actorses_movies_search_votes.html
+   :file: SPHINX_DIRECTORY/figures/examples_movies_describe_actors.html

By aggregating the data, we can find the number of actors and the number of votes by actor.
We can then normalize the data using the min-max method and quantify the notoriety of the actors.

@@ -227,12 +227,12 @@ We can then normalize the data using the min-max method and quantify the notorie
    )
    actors_stats["actor"].dropna()
    res = actors_stats["notoriety_actors"].normalize(method = "minmax")
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_normalize_actors.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_movies_normalize_actors.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_movies_normalize_actors.html
+   :file: SPHINX_DIRECTORY/figures/examples_movies_normalize_actors.html

Let's look at the top ten actors by notoriety.

@@ -254,12 +254,12 @@ Let's look at the top ten actors by notoriety.
            "castings_actors" : "desc",
        },
    ).head(10)
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_actors_notr_head.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_movies_actors_notr_head.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_movies_actors_notr_head.html
+   :file: SPHINX_DIRECTORY/figures/examples_movies_actors_notr_head.html

As expected, we get a list of very popular actors like Robert De Niro, Morgan Freeman, and Clint Eastwood.

@@ -288,12 +288,12 @@ Let's do the same for the directors.
        ],
    )
    res = director_stats["notoriety_director"].normalize(method = "minmax")
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_notoriety_director.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_movies_notoriety_director.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_movies_notoriety_director.html
+   :file: SPHINX_DIRECTORY/figures/examples_movies_notoriety_director.html

Now let's look at the top 10 movie directors.

@@ -315,12 +315,12 @@ Now let's look at the top 10 movie directors.
            "castings_director" : "desc",
        },
    ).head(10)
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_notoriety_director_head_order.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_movies_notoriety_director_head_order.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_movies_notoriety_director_head_order.html
+   :file: SPHINX_DIRECTORY/figures/examples_movies_notoriety_director_head_order.html

Again, we get a list of popular directors like Steven Spielberg, Woody Allen, and Clint Eastwood.

@@ -404,12 +404,12 @@ Let's compute some statistics on our dataset.
    :suppress:

    res = filmtv_movies_complete.describe(method = "all")
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_describe.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_describe.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_describe.html
+   :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_describe.html

We can use the movie's release year to create three categories.

@@ -429,12 +429,12 @@ We can use the movie's release year to create three categories.
filmtv_movies_complete["year"] < 1990, "Old", filmtv_movies_complete["year"] >= 2000, "Recent", "90s", ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_casewhen.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_casewhen.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_casewhen.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_casewhen.html Now, let's look at the countries that made the most movies. @@ -452,12 +452,12 @@ Now, let's look at the countries that made the most movies. columns = ["country"], expr = ["COUNT(*)"], ).sort({"count" : "desc"}).head(10) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_country_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_country_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_country_head.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_country_head.html We can use this variable to create language groups. @@ -537,12 +537,12 @@ We can use this variable to create language groups. vp.StringSQL("REGEXP_LIKE(Country, '{}')".format("|".join(Russian_Est_Europe))), 'Russian_Est_Europe', vp.StringSQL("REGEXP_LIKE(Country, '{}')".format("|".join(Grec_Balkan))), 'Grec_Balkan', 'Others') - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_language.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_language.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_language.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_language.html We can do the same for the genres. @@ -578,12 +578,12 @@ We can do the same for the genres. vp.StringSQL("REGEXP_LIKE(Genre, 'Horror')"), 'Horror', 'Others' ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_category_genre.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_category_genre.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_category_genre.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_category_genre.html Since we're more concerned with the 'Category' at this point, we can drop 'genre.' @@ -606,12 +606,12 @@ Let's look at the missing values. :suppress: res = filmtv_movies_complete.count_percent() - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_missing_vals.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_missing_vals.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_missing_vals.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_missing_vals.html Let's impute the missing values for 'notoriety_actors' and 'castings_actors' using different techniques. We can then drop the few remaining missing values. @@ -653,12 +653,12 @@ We can then drop the few remaining missing values. 
) filmtv_movies_complete.dropna() res = filmtv_movies_complete - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_after_drop.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_after_drop.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_after_drop.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_after_drop.html Before we export the data, we should normalize the numerical columns to get the dummies of the different categories. @@ -747,12 +747,12 @@ Let's create a model to evaluate an unbiased score for each different movie. :okwarning: res = model.report() - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_model_report.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_model_report.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_model_report.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_model_report.html The model is good. Let's add it in our :py:mod:`~verticapy.vDataFrame`. @@ -771,12 +771,12 @@ The model is good. Let's add it in our :py:mod:`~verticapy.vDataFrame`. filmtv_movies_complete, name = "unbiased_vote", ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_model_predict.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_model_predict.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_complete_model_predict.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_complete_model_predict.html Since a score can't be greater than 10 or less than 0, we need to adjust the 'unbiased_vote'. @@ -842,12 +842,12 @@ Let's look at the top movies. "avg_vote" : "desc", }, ).head(10) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_top_movie_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_top_movie_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_top_movie_head.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_top_movie_head.html Great, our results are more consistent. Psycho, Pulp Fiction, and The Godfather are among the top movies. @@ -864,12 +864,12 @@ Since ``k-means`` clustering is sensitive to unnormalized data, let's normalize :suppress: res = filmtv_movies_complete["unbiased_vote"].normalize(method = "minmax") - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_normalize_minmax.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_normalize_minmax.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_normalize_minmax.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_normalize_minmax.html Let's compute the :py:func:`~verticapy.machine_learning.model_selection.elbow` curve to find a suitable number of clusters. 
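As a refresher before we draw it: implementations differ in the exact score they plot, but the classic quantity behind an elbow curve is the total within-cluster sum of squares,

.. math::

    W(k) = \sum_{j=1}^{k} \sum_{x_i \in C_j} \lVert x_i - \mu_j \rVert^2,

where :math:`\mu_j` is the centroid of cluster :math:`C_j`. We pick the k after which adding more clusters stops reducing :math:`W(k)` appreciably.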
@@ -911,10 +911,10 @@ Let's compute the :py:func:`~verticapy.machine_learning.model_selection.elbow` c verticapy.set_option("plotting_lib", "plotly") fig = elbow_chart - fig.write_html("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_elbow_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_movies_filmtv_elbow_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_elbow_plot.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_elbow_plot.html By looking at the elbow curve, we can choose 15 clusters. Let's create a ``k-means`` model. @@ -943,12 +943,12 @@ Let's add the clusters in the :py:mod:`~verticapy.vDataFrame`. filmtv_movies_complete, name = "movies_cluster", ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_movie_cluster_predict.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_movie_cluster_predict.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_movie_cluster_predict.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_movie_cluster_predict.html Let's look at the different clusters. @@ -984,12 +984,12 @@ Let's look at the different clusters. "Category", ], ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_movie_cluster_0_search.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_movie_cluster_0_search.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_movie_cluster_0_search.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_movie_cluster_0_search.html .. code-block:: python @@ -1023,12 +1023,12 @@ Let's look at the different clusters. "Category", ], ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_movie_cluster_1_search.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_movie_cluster_1_search.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_movie_cluster_1_search.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_movie_cluster_1_search.html .. code-block:: python @@ -1062,12 +1062,12 @@ Let's look at the different clusters. "Category", ], ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_movie_cluster_2_search.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_movie_cluster_2_search.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_movie_cluster_2_search.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_movie_cluster_2_search.html .. code-block:: python @@ -1101,12 +1101,12 @@ Let's look at the different clusters. "Category", ], ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_movies_filmtv_movie_cluster_3_search.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_movies_filmtv_movie_cluster_3_search.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_movies_filmtv_movie_cluster_3_search.html + :file: SPHINX_DIRECTORY/figures/examples_movies_filmtv_movie_cluster_3_search.html Each cluster consists of similar movies. 
These clusters can be used to give movie recommendations or help streaming platforms group movies together.
diff --git a/docs/source/examples_business_smart_meters.rst b/docs/source/examples_business_smart_meters.rst
index ae3e3b675..ec0615b43 100644
--- a/docs/source/examples_business_smart_meters.rst
+++ b/docs/source/examples_business_smart_meters.rst
@@ -76,7 +76,7 @@ Create the :py:mod:`~verticapy.vDataFrame` of the datasets:
    :suppress:

    sm_consumption = vp.read_csv(
-        "/project/data/VerticaPy/docs/source/_static/website/examples/data/smart_meters/sm_consumption.csv",
+        "SPHINX_DIRECTORY/source/_static/website/examples/data/smart_meters/sm_consumption.csv",
        dtype = {
            "meterID": "Integer",
            "dateUTC": "Timestamp(6)",
@@ -84,21 +84,21 @@ Create the :py:mod:`~verticapy.vDataFrame` of the datasets:
        }
    )
    sm_weather = vp.read_csv(
-        "/project/data/VerticaPy/docs/source/_static/website/examples/data/smart_meters/sm_weather.csv",
+        "SPHINX_DIRECTORY/source/_static/website/examples/data/smart_meters/sm_weather.csv",
        dtype = {
            "dateUTC": "Timestamp(6)",
            "temperature": "Float(22)",
            "humidity": "Float(22)",
        }
    )
-    sm_meters = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/smart_meters/sm_meters.csv")
+    sm_meters = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/smart_meters/sm_meters.csv")
    res = sm_consumption.head(100)
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_table_head.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_table_head.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_table_head.html
+   :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_table_head.html

.. code-block:: python

@@ -108,12 +108,12 @@ Create the :py:mod:`~verticapy.vDataFrame` of the datasets:
    :suppress:

    res = sm_weather.head(100)
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_weather_table_head.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_sm_weather_table_head.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_sm_weather_table_head.html
+   :file: SPHINX_DIRECTORY/figures/examples_sm_weather_table_head.html

.. code-block:: python

@@ -123,12 +123,12 @@ Create the :py:mod:`~verticapy.vDataFrame` of the datasets:
    :suppress:

    res = sm_meters.head(100)
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_meters_table_head.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_sm_meters_table_head.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

.. raw:: html
-   :file: /project/data/VerticaPy/docs/figures/examples_sm_meters_table_head.html
+   :file: SPHINX_DIRECTORY/figures/examples_sm_meters_table_head.html

Data Exploration and Preparation
---------------------------------
@@ -164,12 +164,12 @@ In VerticaPy, you can interpolate joins; Vertica will find the closest timestamp
        expr2 = ["humidity", "temperature"],
    )
    res = sm_consumption_weather.head(100)
-    html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_weather_table.html", "w")
+    html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_weather_table.html", "w")
    html_file.write(res._repr_html_())
    html_file.close()

..
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_weather_table.html + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_weather_table.html Segmenting Latitude & Longitude using Clustering +++++++++++++++++++++++++++++++++++++++++++++++++ @@ -184,12 +184,12 @@ The dataset 'sm_meters' is pretty important. In particular, the type of residenc :suppress: res = sm_meters.agg(["min", "max"]) - html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_meters_agg_table.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_sm_meters_agg_table.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_meters_agg_table.html + :file: SPHINX_DIRECTORY/figures/examples_sm_meters_agg_table.html .. ipython:: python :okwarning: @@ -234,10 +234,10 @@ Based on the scatter plot, five seems like the optimal number of clusters. Let's import verticapy verticapy.set_option("plotting_lib", "plotly") fig = elbow(sm_meters, ["longitude", "latitude"], n_cluster = (3, 8)) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_sm_meters_elbow_1.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_sm_meters_elbow_1.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_meters_elbow_1.html + :file: SPHINX_DIRECTORY/figures/examples_sm_meters_elbow_1.html The elbow curve seems to confirm that five is the optimal number of clusters, so let's create a ``k-means`` model with that in mind. @@ -326,12 +326,12 @@ Let's join 'sm_meters' with 'sm_consumption_weather'. ], ) res = sm_consumption_weather_region.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_weather_region_table.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_weather_region_table.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_weather_region_table.html + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_weather_region_table.html Handling Missing Values ++++++++++++++++++++++++ @@ -346,12 +346,12 @@ Let's take care of our missing values. :suppress: res = sm_consumption_weather_region.count_percent() - html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_weather_region_count_percent_table.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_weather_region_count_percent_table.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_weather_region_count_percent_table.html + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_weather_region_count_percent_table.html The variable 'value' has a few missing values that we can drop. @@ -365,12 +365,12 @@ The variable 'value' has a few missing values that we can drop. sm_consumption_weather_region["value"].dropna() res = sm_consumption_weather_region.count() - html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_weather_region_count_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_weather_region_count_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. 
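Dropping is the safe default here; if the gaps mattered, an alternative sketch (assuming :py:func:`fillna` supports the ``ffill`` method with ``by`` and ``order_by``, per the VerticaPy documentation) would carry each meter's last reading forward instead:

.. code-block:: python

    # Assumed alternative: forward-fill per meter, ordered by timestamp,
    # instead of dropping the rows.
    sm_consumption_weather_region["value"].fillna(
        method = "ffill",
        by = ["meterID"],
        order_by = ["dateUTC"],
    )

..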
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_weather_region_count_2.html + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_weather_region_count_2.html Interpolation & Aggregations +++++++++++++++++++++++++++++ @@ -422,12 +422,12 @@ To get an equally-sliced dataset, we can then interpolate to fill any gaps. This by = ["meterID"], ) res = sm_consumption_weather_region_clean.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_weather_region_clean_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_weather_region_clean_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_weather_region_clean_1.html + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_weather_region_clean_1.html Let's aggregate the data to figure out the monthly energy consumption for each smart meter. We can then save the result in the Vertica database. @@ -489,12 +489,12 @@ Let's aggregate the data to figure out the monthly energy consumption for each s relation_type = "table", inplace = True, ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_month_clean_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_month_clean_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_month_clean_2.html + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_month_clean_2.html Understanding the Data & Detecting Outliers ++++++++++++++++++++++++++++++++++++++++++++ @@ -515,10 +515,10 @@ Looking at three different smart meters, we can see a clear decrease in energy c import verticapy verticapy.set_option("plotting_lib", "plotly") fig = sm_consumption_month[sm_consumption_month["meterID"] == 10]["value"].plot(ts = "date_month") - fig.write_html("/project/data/VerticaPy/docs/figures/sm_consumption_month_plot_10.html") + fig.write_html("SPHINX_DIRECTORY/figures/sm_consumption_month_plot_10.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/sm_consumption_month_plot_10.html + :file: SPHINX_DIRECTORY/figures/sm_consumption_month_plot_10.html .. code-block:: python @@ -529,10 +529,10 @@ Looking at three different smart meters, we can see a clear decrease in energy c :okwarning: fig = sm_consumption_month[sm_consumption_month["meterID"] == 12]["value"].plot(ts = "date_month") - fig.write_html("/project/data/VerticaPy/docs/figures/sm_consumption_month_plot_12.html") + fig.write_html("SPHINX_DIRECTORY/figures/sm_consumption_month_plot_12.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/sm_consumption_month_plot_12.html + :file: SPHINX_DIRECTORY/figures/sm_consumption_month_plot_12.html .. code-block:: python @@ -543,10 +543,10 @@ Looking at three different smart meters, we can see a clear decrease in energy c :okwarning: fig = sm_consumption_month[sm_consumption_month["meterID"] == 14]["value"].plot(ts = "date_month") - fig.write_html("/project/data/VerticaPy/docs/figures/sm_consumption_month_plot_14.html") + fig.write_html("SPHINX_DIRECTORY/figures/sm_consumption_month_plot_14.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/sm_consumption_month_plot_14.html + :file: SPHINX_DIRECTORY/figures/sm_consumption_month_plot_14.html This behavior seems to be seasonal, but we don't have enough data to prove this. 
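With a longer history, a cheap way to probe this hunch would be the autocorrelation at a twelve-month lag (a sketch; the ``acf`` parameters mirror their use in the commodities example later in these docs):

.. code-block:: python

    # Yearly seasonality should show up as a spike at lag 12
    # in the monthly series.
    sm_consumption_month.acf(
        column = "value",
        ts = "date_month",
        by = ["meterID"],
        p = 12,
    )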
@@ -567,12 +567,12 @@ Let's find outliers in the distribution by computing the ZSCORE per meterID. avg = fun.avg(sm_consumption_month["value"])._over(by = [sm_consumption_month["meterID"]]) sm_consumption_month["value_zscore"] = (sm_consumption_month["value"] - avg) / std res = sm_consumption_month.search("value_zscore > 4") - html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_value_zscore_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_value_zscore_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_value_zscore_1.html + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_value_zscore_1.html Four smart meters are outliers in energy consumption. We'll need to investigate to get more information. @@ -585,10 +585,10 @@ Four smart meters are outliers in energy consumption. We'll need to investigate :okwarning: fig = sm_consumption_month[sm_consumption_month["meterID"] == 364]["value"].plot(ts = "date_month") - fig.write_html("/project/data/VerticaPy/docs/figures/sm_consumption_month_plot_1_364.html") + fig.write_html("SPHINX_DIRECTORY/figures/sm_consumption_month_plot_1_364.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/sm_consumption_month_plot_1_364.html + :file: SPHINX_DIRECTORY/figures/sm_consumption_month_plot_1_364.html .. code-block:: python @@ -599,10 +599,10 @@ Four smart meters are outliers in energy consumption. We'll need to investigate :okwarning: fig = sm_consumption_month[sm_consumption_month["meterID"] == 399]["value"].plot(ts = "date_month") - fig.write_html("/project/data/VerticaPy/docs/figures/sm_consumption_month_plot_1_399.html") + fig.write_html("SPHINX_DIRECTORY/figures/sm_consumption_month_plot_1_399.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/sm_consumption_month_plot_1_399.html + :file: SPHINX_DIRECTORY/figures/sm_consumption_month_plot_1_399.html .. code-block:: python @@ -613,10 +613,10 @@ Four smart meters are outliers in energy consumption. We'll need to investigate :okwarning: fig = sm_consumption_month[sm_consumption_month["meterID"] == 809]["value"].plot(ts = "date_month") - fig.write_html("/project/data/VerticaPy/docs/figures/sm_consumption_month_plot_1_809.html") + fig.write_html("SPHINX_DIRECTORY/figures/sm_consumption_month_plot_1_809.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/sm_consumption_month_plot_1_809.html + :file: SPHINX_DIRECTORY/figures/sm_consumption_month_plot_1_809.html .. code-block:: python @@ -627,10 +627,10 @@ Four smart meters are outliers in energy consumption. We'll need to investigate :okwarning: fig = sm_consumption_month[sm_consumption_month["meterID"] == 951]["value"].plot(ts = "date_month") - fig.write_html("/project/data/VerticaPy/docs/figures/sm_consumption_month_plot_1_951.html") + fig.write_html("SPHINX_DIRECTORY/figures/sm_consumption_month_plot_1_951.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/sm_consumption_month_plot_1_951.html + :file: SPHINX_DIRECTORY/figures/sm_consumption_month_plot_1_951.html Data Encoding & Bivariate Analysis +++++++++++++++++++++++++++++++++++ @@ -655,12 +655,12 @@ Since most of our data is categorical, let's encode them with One-hot encoding. 
max_cardinality = 20, ) res = sm_consumption_month.head(100)
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_month_clean_4.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_month_clean_4.html", "w")
 html_file.write(res._repr_html_()) html_file.close() .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_month_clean_4.html
 + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_month_clean_4.html
 Let's compute the Pearson correlation matrix. @@ -672,10 +672,10 @@ Let's compute the Pearson correlation matrix. :suppress: fig = sm_consumption_month.corr()
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_sm_consumption_month_corr_2.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_sm_consumption_month_corr_2.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_month_corr_2.html
 + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_month_corr_2.html
 There's a clear correlation between the month and energy consumption, but this isn't causal. Instead, we can think of the weather as having a direct influence on energy consumption. To accommodate this view, we'll use the temperature as a predictor (rather than the month). @@ -689,10 +689,10 @@ There's a clear correlation between the month and energy consumption, but this i import verticapy verticapy.set_option("plotting_lib", "plotly") fig = sm_consumption_month.corr(focus = "value")
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_sm_consumption_month_corr_3.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_sm_consumption_month_corr_3.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_month_corr_3.html
 + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_month_corr_3.html
 Global Behavior ++++++++++++++++ @@ -725,10 +725,10 @@ Let's look at this globally. ], ) fig = sm_consumption_final.plot(ts = "date_month", columns = ["avg_value"])
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_sm_consumption_final_7.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_sm_consumption_final_7.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_final_7.html
 + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_final_7.html
 We expect to see a fall in energy consumption during summer and then an increase during the winter. A simple prediction could use the average value a year before. @@ -750,10 +750,10 @@ We expect to see a fall in energy consumption during summer and then an increase fun.lag(sm_consumption_final["avg_value"], 12)._over(order_by = ["date_month"]), ) fig = sm_consumption_final.plot(ts = "date_month", columns = ["prediction", "avg_value"])
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_sm_consumption_final_8.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_sm_consumption_final_8.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_final_8.html
 + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_final_8.html
 .. ipython:: python @@ -793,12 +793,12 @@ Let's create our model.
import verticapy verticapy.set_option("plotting_lib", "plotly") res = model.report("details") - html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_model_report_9.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_model_report_9.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_model_report_9.html + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_model_report_9.html The model seems to be good with an adjusted R2 of 77.5%, and the F-Statistic indicates that at least one of the two predictors is useful. Let's look at the residual plot. @@ -820,10 +820,10 @@ The model seems to be good with an adjusted R2 of 77.5%, and the F-Statistic ind ) sm_consumption_final["residual"] = sm_consumption_final["avg_value"] - sm_consumption_final["value_prediction"] fig = sm_consumption_final.scatter(["avg_value", "residual"]) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_sm_consumption_final_1.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_sm_consumption_final_1.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_final_1.html + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_final_1.html Looking at the residual plot, we can see that the error variance varies by quite a bit. A possible suspect might be heteroscedasticity. Let's verify our hypothesis using a Breusch-Pagan test. @@ -846,12 +846,12 @@ Let's look at the entire regression report. import verticapy verticapy.set_option("plotting_lib", "plotly") res = model.report() - html_file = open("/project/data/VerticaPy/docs/figures/examples_sm_consumption_model_report_10.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_sm_consumption_model_report_10.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_sm_consumption_model_report_10.html + :file: SPHINX_DIRECTORY/figures/examples_sm_consumption_model_report_10.html Our model is very good; its median absolute error is around 13kWh. With this model, we can make predictions about the energy consumption of households per region. If the usage exceeds what the model predicts, we can raise an alert and respond, for example, by regulating the electricity distributed to the region. diff --git a/docs/source/examples_business_spam.rst b/docs/source/examples_business_spam.rst index 7990cadc4..2caf63ac7 100644 --- a/docs/source/examples_business_spam.rst +++ b/docs/source/examples_business_spam.rst @@ -45,15 +45,15 @@ Let's take a look at the first few entries in the dataset. :suppress: spam = vp.read_csv( - "/project/data/VerticaPy/docs/source/_static/website/examples/data/spam/spam.csv", + "SPHINX_DIRECTORY/source/_static/website/examples/data/spam/spam.csv", ) res = spam.head(10) - html_file = open("/project/data/VerticaPy/docs/figures/examples_spam_table.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_spam_table.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spam_table.html + :file: SPHINX_DIRECTORY/figures/examples_spam_table.html Data Exploration and Preparation --------------------------------- @@ -76,12 +76,12 @@ Our dataset relies on text analysis. First, we should create some features. 
For spam["length"] = fun.length(spam["content"]) spam["content"].apply("LOWER({})") res = spam["type"].decode('spam', 1, 0)
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_spam_table_clean.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_spam_table_clean.html", "w")
 html_file.write(res._repr_html_()) html_file.close() .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_spam_table_clean.html
 + :file: SPHINX_DIRECTORY/figures/examples_spam_table_clean.html
 Let's compute some statistics using the length of the message. @@ -99,12 +99,12 @@ Let's compute some statistics using the length of the message. method = 'cat_stats', numcol = 'length', )
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_spam_table_describe.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_spam_table_describe.html", "w")
 html_file.write(res._repr_html_()) html_file.close() .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_spam_table_describe.html
 + :file: SPHINX_DIRECTORY/figures/examples_spam_table_describe.html
 .. note:: Spam tends to be longer than a normal message. First, let's create a view with just spam. Then, we'll use the :py:mod:`~verticapy.machine_learning.vertica.CountVectorizer` to create a dictionary and identify keywords. @@ -131,12 +131,12 @@ Let's compute some statistics using the length of the message. dict_spams.fit(spams, ["content"]) dict_spams = dict_spams.transform() res = dict_spams
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_spam_table_clean_2.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_spam_table_clean_2.html", "w")
 html_file.write(res._repr_html_()) html_file.close() .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_spam_table_clean_2.html
 + :file: SPHINX_DIRECTORY/figures/examples_spam_table_clean_2.html
 Let's add the most frequent words in our :py:mod:`~verticapy.vDataFrame` and compute the correlation vector. @@ -167,10 +167,10 @@ Let's add the most frequent words in our :py:mod:`~verticapy.vDataFrame` and com column = "content", ) fig = spam.corr(focus = "type")
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spam_corr.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_spam_corr.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_spam_corr.html
 + :file: SPHINX_DIRECTORY/figures/examples_spam_corr.html
 Let's keep just the 100 most-correlated features and merge the numbers together. @@ -205,12 +205,12 @@ Let's keep just the 100 most-correlated features and merge the numbers tog method = "count", name = "nb_numbers", )
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_spam_table_regexp.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_spam_table_regexp.html", "w")
 html_file.write(res._repr_html_()) html_file.close() .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_spam_table_regexp.html
 + :file: SPHINX_DIRECTORY/figures/examples_spam_table_regexp.html
 Let's narrow down our keyword list to words of more than two characters. @@ -241,10 +241,10 @@ Compute the correlation vector again using the response column. import verticapy verticapy.set_option("plotting_lib", "plotly") fig = spam.corr(focus = "type")
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spam_corr_2.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_spam_corr_2.html")
 ..
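As an aside, the same ``regexp`` mechanics used above for digits turn any keyword into a count feature (a sketch; the keyword list is a placeholder, since in practice the words come from the ``dict_spams`` dictionary):

.. code-block:: python

    # Hypothetical keywords; the real list comes from CountVectorizer.
    for word in ["free", "win", "call"]:
        spam.regexp(
            column = "content",
            pattern = word,
            method = "count",
            name = "word_" + word,
        )

..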
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spam_corr_2.html + :file: SPHINX_DIRECTORY/figures/examples_spam_corr_2.html We have enough correlated features with our response to create a fantastic model. @@ -288,12 +288,12 @@ The ``Naive Bayes`` classifier is a powerful and performant algorithm for text a "type", cv = 5, ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_spam_table_report.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_spam_table_report.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spam_table_report.html + :file: SPHINX_DIRECTORY/figures/examples_spam_table_report.html We have an excellent model! Let's learn from the data. diff --git a/docs/source/examples_business_spotify.rst b/docs/source/examples_business_spotify.rst index a0462b592..1543f27e6 100644 --- a/docs/source/examples_business_spotify.rst +++ b/docs/source/examples_business_spotify.rst @@ -103,12 +103,12 @@ Load the datasets into the :py:mod:`~verticapy.vDataFrame` with :py:func:`~verti :suppress: artists = vp.read_csv( - "/project/data/VerticaPy/docs/source/_static/website/examples/data/spotify/artists.csv", + "SPHINX_DIRECTORY/source/_static/website/examples/data/spotify/artists.csv", schema = "spotify", parse_nrows = 100, ) res = artists.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_spotify_artists_table.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_spotify_artists_table.html", "w") html_file.write(res._repr_html_()) html_file.close() @@ -117,19 +117,19 @@ Load the datasets into the :py:mod:`~verticapy.vDataFrame` with :py:func:`~verti tracks.head(100) .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_artists_table.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_artists_table.html .. ipython:: python :suppress: - tracks = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/spotify/tracks.csv",schema = "spotify",parse_nrows = 100) + tracks = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/spotify/tracks.csv",schema = "spotify",parse_nrows = 100) res = tracks.head(100) - html_file = open("/project/data/VerticaPy/docs/figures/examples_spotify_tracks_table.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_spotify_tracks_table.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_tracks_table.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_tracks_table.html .. warning:: @@ -179,10 +179,10 @@ We can visualize the top 60 most-followed Polish artists with a bar chart. max_cardinality = 50, width = 800, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_polish_followers_bar.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_polish_followers_bar.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_polish_followers_bar.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_polish_followers_bar.html We can do the same with the most popular tracks. For example, we can graph Monika Brodka's most popular tracks like so: @@ -212,10 +212,10 @@ We can do the same with the most popular tracks. 
For example, we can graph Monik max_cardinality = 25, width = 800, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_brodka_popularity_bar.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_brodka_popularity_bar.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_brodka_popularity_bar.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_brodka_popularity_bar.html To get an idea of what makes Monika Brodka's songs popular, let's create a boxplot of the numerical feature distribution of her tracks. @@ -252,10 +252,10 @@ To get an idea of what makes Monika Brodka's songs popular, let's create a boxpl # create a boxplot of the above features fig = brodka_tracks.boxplot(columns = numerical_features) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_boxplot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_boxplot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_boxplot.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_boxplot.html Timing is a classic factor for success, so let's look at the popularity of Monika's songs over time with a smooth curve. @@ -294,10 +294,10 @@ Timing is a classic factor for success, so let's look at the popularity of Monik # plot the smoothed curve for popularity of her songs fig = brodka_tracks.plot(ts = "release_date", columns = ["smoothed_popularity"]) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_brodka_release_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_brodka_release_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_brodka_release_plot.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_brodka_release_plot.html Numerical-feature Analysis --------------------------- @@ -350,10 +350,10 @@ features change and correlate with each other in Monika's most popular songs. ts = "release_year", columns = numerical_features, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_brodka_release_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_brodka_release_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_brodka_release_plot.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_brodka_release_plot.html .. code-block:: @@ -365,10 +365,10 @@ features change and correlate with each other in Monika's most popular songs. :okwarning: fig = tracks[tracks[numerical_features]].corr() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_tracks_corr.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_tracks_corr.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_tracks_corr.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_tracks_corr.html Feature Engineering -------------------- @@ -455,12 +455,12 @@ Additionally, we manipulate our data a bit to make things easier later on: ) polish_tracks["nb_singers"].add(1) res = polish_tracks - html_file = open("/project/data/VerticaPy/docs/figures/examples_spotify_polish_tracks_clean_table.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_spotify_polish_tracks_clean_table.html", "w") html_file.write(res._repr_html_()) html_file.close() .. 
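As a side note on the smoothing step above: a windowed mean like the smoothed popularity can also be written with ``rolling`` (a sketch; the window bounds are an arbitrary choice and the parameter names are assumed from the VerticaPy docs):

.. code-block:: python

    # Assumed sketch: average of the current and two previous releases.
    brodka_tracks.rolling(
        func = "avg",
        window = (-2, 0),
        columns = ["popularity"],
        order_by = ["release_date"],
        name = "smoothed_popularity_2",  # new column, to avoid clobbering
    )

..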
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_polish_tracks_clean_table.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_polish_tracks_clean_table.html Define a list of predictors and the response, and then save the normalized version of the final dataset to the database. @@ -564,10 +564,10 @@ Train the model. :okwarning: fig = auto_model.plot() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_automl_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_automl_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_automl_plot.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_automl_plot.html Extract the best model according to :py:mod:`~verticapy.machine_learning.vertica.automl.AutoML`. From here, we can look at the model type and its hyperparameters. @@ -621,12 +621,12 @@ Thanks to :py:mod:`~verticapy.machine_learning.vertica.automl.AutoML`, we know b polish_tracks, name = "estimated_popularity", ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_spotify_lr_prediction.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_spotify_lr_prediction.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_lr_prediction.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_lr_prediction.html View the regression report and the importance of each feature. @@ -639,12 +639,12 @@ View the regression report and the importance of each feature. :okwarning: res = rf_model.regression_report() - html_file = open("/project/data/VerticaPy/docs/figures/examples_spotify_lr_report.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_spotify_lr_report.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_lr_report.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_lr_report.html .. code-block:: @@ -655,10 +655,10 @@ View the regression report and the importance of each feature. :okwarning: fig = rf_model.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_lr_featrures.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_lr_featrures.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_lr_featrures.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_lr_featrures.html To see how our model performs, let's plot the popularity and estimated popularity of songs by other Polish artists like Brodka and Akcent. @@ -692,10 +692,10 @@ To see how our model performs, let's plot the popularity and estimated popularit ts = "name", columns = ["popularity", "estimated_popularity"], ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_lr_brodaka_predict_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_lr_brodaka_predict_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_spotify_lr_brodaka_predict_plot.html + :file: SPHINX_DIRECTORY/figures/examples_spotify_lr_brodaka_predict_plot.html .. code-block:: @@ -733,10 +733,10 @@ To see how our model performs, let's plot the popularity and estimated popularit "estimated_popularity", ], ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_lr_akcent_predict_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_lr_akcent_predict_plot.html") .. 
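A single scalar check complements these plots (a sketch; assuming the fitted regressor exposes ``score`` with a ``metric`` argument, as VerticaPy regressors generally do):

.. code-block:: python

    # R-squared of the random forest on its training relation.
    rf_model.score(metric = "r2")

..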
raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_spotify_lr_akcent_predict_plot.html
 + :file: SPHINX_DIRECTORY/figures/examples_spotify_lr_akcent_predict_plot.html
 Group Artists using Track Features ------------------------------------ @@ -790,12 +790,12 @@ Let's start by taking the averages of these numerical features for each artist. # save relation to the database as "artists_features" artists_features.to_db('"spotify"."artists_features"') res = artists_features
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_spotify_artists_features.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_spotify_artists_features.html", "w")
 html_file.write(res._repr_html_()) html_file.close() .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_spotify_artists_features.html
 + :file: SPHINX_DIRECTORY/figures/examples_spotify_artists_features.html
 Grouping means clustering, so we use an :py:func:`~verticapy.machine_learning.model_selection.elbow` curve to find a suitable number of clusters. @@ -832,10 +832,10 @@ Grouping means clustering, so we use an :py:func:`~verticapy.machine_learning.mo :okwarning: fig = elbow_curve
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_lr_elbow.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_lr_elbow.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_spotify_lr_elbow.html
 + :file: SPHINX_DIRECTORY/figures/examples_spotify_lr_elbow.html
 Let's define and use the Vertica ``k-means`` algorithm to create a model that can group artists together. @@ -871,10 +871,10 @@ Plot the result of the k-means algorithm: :okwarning: fig = model.plot()
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_spotify_cluster_plot.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_spotify_cluster_plot.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_spotify_cluster_plot.html
 + :file: SPHINX_DIRECTORY/figures/examples_spotify_cluster_plot.html
 .. ipython:: python @@ -905,12 +905,12 @@ Let's see how our model groups these artists together: :okwarning: res = pred_genres["artists", "pred_genres"].sort({"pred_genres": "desc"})
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_spotify_pred_genres.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_spotify_pred_genres.html", "w")
 html_file.write(res._repr_html_()) html_file.close() .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_spotify_pred_genres.html
 + :file: SPHINX_DIRECTORY/figures/examples_spotify_pred_genres.html
 Conclusion ----------- diff --git a/docs/source/examples_learn_commodities.rst b/docs/source/examples_learn_commodities.rst index 47a881a8e..f9b17077c 100644 --- a/docs/source/examples_learn_commodities.rst +++ b/docs/source/examples_learn_commodities.rst @@ -50,12 +50,12 @@ Let's create a Virtual DataFrame of the dataset. from verticapy.datasets import load_commodities commodities = load_commodities() res = commodities.head(100)
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_commodities_table_head.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_commodities_table_head.html", "w")
 html_file.write(res._repr_html_()) html_file.close() ..
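Before going further, a quick structural sanity check is cheap (a sketch using standard :py:mod:`~verticapy.vDataFrame` helpers):

.. code-block:: python

    # Confirm row/column counts and the ingested types.
    commodities.shape()
    commodities.dtypes()

..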
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_head.html + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_head.html Data Exploration and Preparation --------------------------------- @@ -71,12 +71,12 @@ Let's explore the data by displaying descriptive statistics of all the columns. :okwarning: res = commodities.describe(method = "all", unique = True) - html_file = open("/project/data/VerticaPy/docs/figures/examples_commodities_table_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_commodities_table_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_describe.html + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_describe.html We have data from January 1986 to the beginning of August 2020. We don't have any missing values, so our data is already clean. @@ -93,10 +93,10 @@ Let's draw the different variables. import verticapy verticapy.set_option("plotting_lib", "plotly") fig = commodities.plot(ts = "date") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_plot.html + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_plot.html Some of the commodities have an upward monotonic trend and some others might be stationary. Let's use Augmented Dickey-Fuller tests to check our hypotheses. @@ -139,12 +139,12 @@ Some of the commodities have an upward monotonic trend and some others might be fuller[commodity] = result["value"] fuller = TableSample(fuller) res = fuller - html_file = open("/project/data/VerticaPy/docs/figures/examples_commodities_table_adfuller.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_commodities_table_adfuller.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_adfuller.html + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_adfuller.html As expected: The price of gold and the S&P 500 index are not stationary. Let's use the Mann-Kendall test to confirm the trends. @@ -181,12 +181,12 @@ As expected: The price of gold and the S&P 500 index are not stationary. Let's u kendall[commodity] = result["value"] kendall = TableSample(kendall) res = kendall - html_file = open("/project/data/VerticaPy/docs/figures/examples_commodities_table_kendall.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_commodities_table_kendall.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_kendall.html + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_kendall.html Our hypothesis is correct. We can also look at the correlation between the elapsed time and our variables to see the different trends. @@ -205,10 +205,10 @@ Our hypothesis is correct. We can also look at the correlation between the elaps commodities["elapsed_days"] = commodities["date"] - fun.min(commodities["date"])._over() fig = commodities.corr(focus = "elapsed_days") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_corr_1.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_corr_1.html") .. 
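For reference, the test wrapped by the loop above can also be called directly on a single series (a sketch; the import path is assumed from current VerticaPy releases):

.. code-block:: python

    from verticapy.machine_learning.model_selection.statistical_tests import adfuller

    # Augmented Dickey-Fuller test on one commodity.
    adfuller(commodities, column = "Gold", ts = "date")

..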
raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_corr_1.html
 + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_corr_1.html
 In the last plot, it's a bit hard to tell if 'Spread' is stationary. Let's draw it alone. @@ -221,10 +221,10 @@ In the last plot, it's a bit hard to tell if 'Spread' is stationary. Let's draw :okwarning: fig = commodities["Spread"].plot(ts = "date")
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_plot_2.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_plot_2.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_plot_2.html
 + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_plot_2.html
 We can see some sudden changes, so let's smooth the curve. @@ -251,10 +251,10 @@ We can see some sudden changes, so let's smooth the curve. name = "Spread_smooth", ) fig = commodities["Spread_smooth"].plot(ts = "date")
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_plot_3.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_plot_3.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_plot_3.html
 + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_plot_3.html
 After each local minimum, there is a local maximum. Let's look at the number of lags needed to keep most of the information. To visualize this, we can draw the autocorrelation function (ACF) and partial autocorrelation function (PACF) plots. @@ -267,10 +267,10 @@ After each local minimum, there is a local maximum. Let's look at the number of :okwarning: fig = commodities.acf(column = "Spread", ts = "date", p = 12)
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_plot_acf_2.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_plot_acf_2.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_plot_acf_2.html
 + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_plot_acf_2.html
 .. code-block:: python @@ -281,10 +281,10 @@ After each local minimum, there is a local maximum. Let's look at the number of :okwarning: fig = commodities.pacf(column = "Spread", ts = "date", p = 5)
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_plot_pacf_2.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_plot_pacf_2.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_plot_pacf_2.html
 + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_plot_pacf_2.html
 We can clearly see the influence of the last two values on 'Spread', which makes sense. When the curve slightly changes its direction, it will increase/decrease until reaching a new local maximum/minimum. Only recent values can help the prediction in the case of an autoregressive periodic model. The local minima of interest rate spreads are indicators of an economic crisis. @@ -299,10 +299,10 @@ We saw the correlation between the price-per-barrel of Oil and the time. Let's l :okwarning: fig = commodities["Oil"].plot(ts = "date")
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_plot_4.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_plot_4.html")
 ..
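If the raw series proves too noisy to read, the same moving-average treatment used for 'Spread' applies here (a sketch; the window bounds are an arbitrary choice and the ``rolling`` parameters are assumed from the VerticaPy docs):

.. code-block:: python

    # Assumed sketch: center a +/- 5 observation window on each point.
    commodities.rolling(
        func = "avg",
        window = (-5, 5),
        columns = ["Oil"],
        order_by = ["date"],
        name = "Oil_smooth",
    )

..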
raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_plot_4.html
 + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_plot_4.html
 Moving on to the correlation matrix, we can see many events that drastically changed the values of commodities, and we know of a correlation between all of them. From here, we could look at how strong this correlation is, which will help us create a model that properly combines all the variable lags in its predictions. @@ -315,10 +315,10 @@ Moving on to the correlation matrix, we can see many events that drastically cha :okwarning: fig = commodities.corr(columns = ["Gold", "Oil", "Spread", "Vix", "Dol_Eur", "SP500"])
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_corr_2.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_corr_2.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_corr_2.html
 + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_corr_2.html
 We can see strong correlations between most of the variables. A vector autoregression (:py:mod:`~verticapy.machine_learning.vertica.VAR`) model seems ideal. @@ -352,12 +352,12 @@ Let's create the :py:mod:`~verticapy.machine_learning.vertica.VAR` model to pred y = ["Gold", "Oil", "Spread", "Vix", "Dol_Eur", "SP500"], ) res = model.score()
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_commodities_table_ml_score.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_commodities_table_ml_score.html", "w")
 html_file.write(res._repr_html_()) html_file.close() .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_ml_score.html
 + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_ml_score.html
 Our model is excellent. Let's predict the values of these commodities in the near future. @@ -373,10 +373,10 @@ Gold :okwarning: fig = model.plot(idx = 0, npredictions = 60)
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_pred_plot_0.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_pred_plot_0.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_pred_plot_0.html
 + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_pred_plot_0.html
 Oil: +++++ @@ -390,10 +390,10 @@ Oil: :okwarning: fig = model.plot(idx = 1, npredictions = 60)
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_pred_plot_1.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_pred_plot_1.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_pred_plot_1.html
 + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_pred_plot_1.html
 Spread: ++++++++ @@ -407,10 +407,10 @@ Spread: :okwarning: fig = model.plot(idx = 2, npredictions = 60)
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_pred_plot_2.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_pred_plot_2.html")
 ..
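These plots are driven by the model's forecasts; to materialize the same horizon as a table instead (a sketch; assuming the time-series ``predict`` method accepts ``npredictions``, per the current docs):

.. code-block:: python

    # Sixty steps ahead for all six series.
    model.predict(npredictions = 60)

..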
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_pred_plot_2.html + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_pred_plot_2.html Vix: +++++ @@ -424,10 +424,10 @@ Vix: :okwarning: fig = model.plot(idx = 3, npredictions = 60) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_pred_plot_3.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_pred_plot_3.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_pred_plot_3.html + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_pred_plot_3.html Dol_Eur: +++++++++ @@ -441,10 +441,10 @@ Dol_Eur: :okwarning: fig = model.plot(idx = 4, npredictions = 60) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_commodities_table_pred_plot_4.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_commodities_table_pred_plot_4.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_commodities_table_pred_plot_4.html + :file: SPHINX_DIRECTORY/figures/examples_commodities_table_pred_plot_4.html The model performs well but may be somewhat unstable. To improve it, we could apply data preparation techniques, such as seasonal decomposition, before building the :py:mod:`~verticapy.machine_learning.vertica.VAR` model. diff --git a/docs/source/examples_learn_iris.rst b/docs/source/examples_learn_iris.rst index d69ead195..5e8e2ca1d 100644 --- a/docs/source/examples_learn_iris.rst +++ b/docs/source/examples_learn_iris.rst @@ -48,12 +48,12 @@ Let's create a Virtual DataFrame of the dataset. from verticapy.datasets import load_iris iris = load_iris() res = iris.head(5) - html_file = open("/project/data/VerticaPy/docs/figures/examples_iris_table_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_iris_table_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_iris_table_head.html + :file: SPHINX_DIRECTORY/figures/examples_iris_table_head.html Data Exploration and Preparation --------------------------------- @@ -68,12 +68,12 @@ Let's explore the data by displaying descriptive statistics of all the columns. :suppress: res = iris.describe(method = "categorical", unique = True) - html_file = open("/project/data/VerticaPy/docs/figures/examples_iris_table_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_iris_table_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_iris_table_describe.html + :file: SPHINX_DIRECTORY/figures/examples_iris_table_describe.html We don't have much data here, but that's okay; since different flower species have different proportions and ratios between those proportions, we can start by making ratios between each feature. @@ -104,10 +104,10 @@ We can draw the correlation matrix (Pearson correlation coefficient) of the new import verticapy verticapy.set_option("plotting_lib", "plotly") fig = iris.corr(width = 800, height = 800) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_iris_table_corr_matrix.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_iris_table_corr_matrix.html") .. 
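The ratio features mentioned above are one-liners on the :py:mod:`~verticapy.vDataFrame` (a sketch; the column names assume the standard ``load_iris`` schema):

.. code-block:: python

    # Scale-independent shape descriptors.
    iris["sepal_ratio"] = iris["SepalLengthCm"] / iris["SepalWidthCm"]
    iris["petal_ratio"] = iris["PetalLengthCm"] / iris["PetalWidthCm"]

..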
raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_iris_table_corr_matrix.html
 + :file: SPHINX_DIRECTORY/figures/examples_iris_table_corr_matrix.html
 The Iris setosa is highly linearly correlated with the petal length and the sepal ratio. We can see a perfect separation using the two features (though we can also see this separation with the petal length alone). @@ -128,10 +128,10 @@ The Iris setosa is highly linearly correlated with the petal length and the sepa width = 800, height = 800, )
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_iris_scatter_1.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_iris_scatter_1.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_iris_scatter_1.html
 + :file: SPHINX_DIRECTORY/figures/examples_iris_scatter_1.html
 We can see a clear linear separation between the Iris setosa and the other species, but we'll need more features to identify the differences between Iris virginica and Iris versicolor. @@ -160,10 +160,10 @@ We can see a clear linear separation between the Iris setosa and the other specie width = 800, height = 800, )
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_iris_scatter_2.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_iris_scatter_2.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_iris_scatter_2.html
 + :file: SPHINX_DIRECTORY/figures/examples_iris_scatter_2.html
 Our strategy is simple: we'll use two Linear Support Vector Classifiers (SVC): one to classify the Iris setosa and another to classify the Iris virginica; whatever remains is Iris versicolor. @@ -192,12 +192,12 @@ Let's build the first :py:mod:`~verticapy.machine_learning.vertica.LinearSVC` to response = "Species_Iris-setosa" model = LinearSVC("svc_setosa_iris") res = cross_validate(model, iris, predictors, response)
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_iris_table_ml_cv.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_iris_table_ml_cv.html", "w")
 html_file.write(res._repr_html_()) html_file.close() .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_iris_table_ml_cv.html
 + :file: SPHINX_DIRECTORY/figures/examples_iris_table_ml_cv.html
 Our model is excellent. Let's build it using the entire dataset. @@ -216,10 +216,10 @@ Let's plot the model to see the perfect separation. :okwarning: fig = model.plot(width = 800, height = 800)
 - fig.write_html("/project/data/VerticaPy/docs/figures/examples_model_plot.html")
 + fig.write_html("SPHINX_DIRECTORY/figures/examples_model_plot.html")
 .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_model_plot.html
 + :file: SPHINX_DIRECTORY/figures/examples_model_plot.html
 We can add this probability to the :py:mod:`~verticapy.vDataFrame`. @@ -231,12 +231,12 @@ We can add this probability to the :py:mod:`~verticapy.vDataFrame`. :suppress: res = model.predict_proba(iris, name = "setosa", pos_label = 1)
 - html_file = open("/project/data/VerticaPy/docs/figures/examples_model_predict_proba.html", "w")
 + html_file = open("SPHINX_DIRECTORY/figures/examples_model_predict_proba.html", "w")
 html_file.write(res._repr_html_()) html_file.close() .. raw:: html
 - :file: /project/data/VerticaPy/docs/figures/examples_model_predict_proba.html
 + :file: SPHINX_DIRECTORY/figures/examples_model_predict_proba.html
 Let's create a model to classify the Iris virginica. @@ -268,12 +268,12 @@ Let's create a model to classify the Iris virginica.
response = "Species_Iris-virginica" model = LinearSVC("svc_virginica_iris") res = cross_validate(model, iris, predictors, response) - html_file = open("/project/data/VerticaPy/docs/figures/examples_iris_table_ml_cv_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_iris_table_ml_cv_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_iris_table_ml_cv_2.html + :file: SPHINX_DIRECTORY/figures/examples_iris_table_ml_cv_2.html We have another excellent model. Let's add it to the :py:mod:`~verticapy.vDataFrame`. @@ -287,12 +287,12 @@ We have another excellent model. Let's add it to the :py:mod:`~verticapy.vDataFr model.fit(iris, predictors, response) res = model.predict_proba(iris, name = "virginica", pos_label = 1) - html_file = open("/project/data/VerticaPy/docs/figures/examples_model_predict_proba_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_model_predict_proba_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_model_predict_proba_2.html + :file: SPHINX_DIRECTORY/figures/examples_model_predict_proba_2.html Let's evaluate our final model (the combination of two :py:mod:`~verticapy.machine_learning.vertica.LinearSVC`). diff --git a/docs/source/examples_learn_pokemon.rst b/docs/source/examples_learn_pokemon.rst index b8d994b18..1f41c7085 100644 --- a/docs/source/examples_learn_pokemon.rst +++ b/docs/source/examples_learn_pokemon.rst @@ -60,14 +60,14 @@ Let's ingest the datasets. import verticapy.sql.functions as fun - combats = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/pokemon/fights.csv") + combats = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/pokemon/fights.csv") res = combats.head(5) - html_file = open("/project/data/VerticaPy/docs/figures/examples_combats_table.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_combats_table.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_combats_table.html + :file: SPHINX_DIRECTORY/figures/examples_combats_table.html .. code-block:: python @@ -77,14 +77,14 @@ Let's ingest the datasets. .. ipython:: python :suppress: - pokemon = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/pokemon/pokemons.csv") + pokemon = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/pokemon/pokemons.csv") res = pokemon.head(5) - html_file = open("/project/data/VerticaPy/docs/figures/examples_pokemon_table_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_pokemon_table_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_pokemon_table_2.html + :file: SPHINX_DIRECTORY/figures/examples_pokemon_table_2.html Data Exploration and Preparation --------------------------------- @@ -101,12 +101,12 @@ The 'pokemon' table contains the information on each Pokemon. Let's describe thi :suppress: res = pokemon.describe(method = "categorical", unique = True) - html_file = open("/project/data/VerticaPy/docs/figures/examples_pokemon_table_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_pokemon_table_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. 
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_pokemon_table_describe.html + :file: SPHINX_DIRECTORY/figures/examples_pokemon_table_describe.html The pokemon's 'Name', 'Generation', and whether or not it's 'Legendary' will never influence the outcome of the battle, so we can drop these columns. @@ -130,12 +130,12 @@ The pokemon's 'Name', 'Generation', and whether or not it's 'Legendary' will nev "Name", ] ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_pokemon_table_drop.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_pokemon_table_drop.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_pokemon_table_drop.html + :file: SPHINX_DIRECTORY/figures/examples_pokemon_table_drop.html The 'ID' will be the key to join the data. By joining the data, we will be able to create more relevant features. @@ -224,12 +224,12 @@ Missing values can not be handled by most machine learning models. Let's see whi :suppress: res = fights.count() - html_file = open("/project/data/VerticaPy/docs/figures/examples_pokemon_table_clean_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_pokemon_table_clean_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_pokemon_table_clean_1.html + :file: SPHINX_DIRECTORY/figures/examples_pokemon_table_clean_1.html In terms of missing values, our only concern is the Pokemon's second type (Type_2_1 and Type_2_2). Since some Pokemon only have one type, these features are MNAR (missing values not at random). We can impute the missing values by creating another category. @@ -243,12 +243,12 @@ In terms of missing values, our only concern is the Pokemon's second type (Type_ fights["Type_2_1"].fillna("No") res = fights["Type_2_2"].fillna("No") - html_file = open("/project/data/VerticaPy/docs/figures/examples_pokemon_table_clean_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_pokemon_table_clean_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_pokemon_table_clean_2.html + :file: SPHINX_DIRECTORY/figures/examples_pokemon_table_clean_2.html Let's use the current_relation method to see how our data preparation so far on the :py:mod:`~verticapy.vDataFrame` generates SQL code. @@ -270,10 +270,10 @@ Let's look at the correlations between all the variables. import verticapy verticapy.set_option("plotting_lib", "plotly") fig = fights.corr(method = "spearman") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_pokemon_corr.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_pokemon_corr.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_pokemon_corr.html + :file: SPHINX_DIRECTORY/figures/examples_pokemon_corr.html Many variables are correlated to the response column. We have enough information to create our predictive model. @@ -311,12 +311,12 @@ Some really important features are categorical. Random forest can handle them. B nbins = 100, ) res = cross_validate(model, fights, predictors, "Winner") - html_file = open("/project/data/VerticaPy/docs/figures/examples_pokemon_cv.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_pokemon_cv.html", "w") html_file.write(res._repr_html_()) html_file.close() .. 
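Cross-validation is the headline check; a plain holdout split makes a quick second opinion (a sketch; the ``train_test_split`` and ``score`` parameters are assumed from the VerticaPy docs):

.. code-block:: python

    # 70/30 holdout as a sanity check on the CV scores.
    train, test = fights.train_test_split(test_size = 0.3)
    model.fit(train, predictors, "Winner", test)
    model.score(metric = "auc")

..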
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_pokemon_cv.html + :file: SPHINX_DIRECTORY/figures/examples_pokemon_cv.html We have an excellent model with an average AUC of more than 99%. Let's create a model with the entire dataset and look at the importance of each feature. @@ -339,10 +339,10 @@ We have an excellent model with an average AUC of more than 99%. Let's create a "Winner", ) fig = model.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_pokemon_features_importance_ml.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_pokemon_features_importance_ml.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_pokemon_features_importance_ml.html + :file: SPHINX_DIRECTORY/figures/examples_pokemon_features_importance_ml.html Based on our model, it seems that a Pokemon's speed and attack stats are the strongest predictors for the winner of a battle. diff --git a/docs/source/examples_learn_titanic.rst b/docs/source/examples_learn_titanic.rst index cd03314a8..f348af24e 100644 --- a/docs/source/examples_learn_titanic.rst +++ b/docs/source/examples_learn_titanic.rst @@ -42,12 +42,12 @@ Let's create a Virtual DataFrame of the dataset. from verticapy.datasets import load_titanic titanic = load_titanic() res = titanic.head(5) - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_head.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_head.html Data Exploration and Preparation --------------------------------- @@ -62,12 +62,12 @@ Let's explore the data by displaying descriptive statistics of all the columns. :suppress: res = titanic.describe(method = "categorical", unique = True) - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_describe.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_describe.html The columns "body" (body identification number), "home.dest" (passenger origin/destination), "embarked" (origin port) and "ticket" (ticket ID) shouldn't influence survival, so we can ignore these. @@ -90,12 +90,12 @@ Let's focus our analysis on the columns "name" and "cabin". We'll begin with the model = CountVectorizer() model.fit(titanic, ["Name"]) res = model.transform() - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_count_vect_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_count_vect_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_count_vect_1.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_count_vect_1.html Passengers' titles might come in handy. We can extract these from their names (see the short sketch below). @@ -114,12 +114,12 @@ Let's move on to the cabins.
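Before looking at the cabins, here is a minimal sketch of the title extraction mentioned above. It is only an illustration: the regular expression is the same one used later in this example, and we assume ``str_extract`` returns the first captured group.

.. code-block:: python

    # Sketch: capture the word between a space and a dot in the name,
    # e.g. 'Allen, Miss. Elisabeth Walton' -> 'Miss'.
    titanic["name"].str_extract(' ([A-Za-z]+)\.')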
model = CountVectorizer() model.fit("titanic", ["cabin"]) res = model.transform() - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_count_vect_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_count_vect_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_count_vect_2.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_count_vect_2.html Here, we have the cabin IDs, the letter of which represents a certain position on the boat. Let's see how often each cabin occurs in the dataset. @@ -140,12 +140,12 @@ Here, we have the cabin IDs, the letter of which represents a certain position o res = model.transform()["token"].str_slice(1, 1).groupby( columns = ["token"], expr = ["SUM(cnt)"] ).head(30) - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_count_vect_3.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_count_vect_3.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_count_vect_3.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_count_vect_3.html While ``NULL`` values for "boat" clearly represent passengers who didn't get a spot on a dedicated "lifeboat", we can't be so sure about ``NULL`` values for "cabin". We can guess that these might represent passengers without a cabin. If this is the case, then these are missing values not at random (MNAR). @@ -168,12 +168,12 @@ We'll revisit this problem later. For now, let's drop the columns that don't aff ' ([A-Za-z]+)\.')["boat"].fillna( method = "0ifnull" )["cabin"].fillna("No Cabin") - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_drop_clean.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_drop_clean.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_drop_clean.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_drop_clean.html Looking at our data now, we can see that some first-class passengers have a ``NULL`` value for their cabin, so we can safely say that our assumption about the meaning of a ``NULL`` value of "cabin" turned out to be incorrect. This means that the "cabin" column has far too many missing values at random (MAR). We'll have to drop it. @@ -185,12 +185,12 @@ Looking at our data now, we can see that some first class passengers have a ``NU :suppress: res = titanic["cabin"].drop() - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_drop_clean_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_drop_clean_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_drop_clean_2.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_drop_clean_2.html Let's look at descriptive statistics of the entire Virtual DataFrame. @@ -202,12 +202,12 @@ Let's look at descriptive statistics of the entire Virtual Dataframe. :suppress: res = titanic.describe(method = "all") - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_describe_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_describe_2.html", "w") html_file.write(res._repr_html_()) html_file.close() ..
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_describe_2.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_describe_2.html Descriptive statistics can give us valuable insights into our data. Notice, for example, that the column "fare" has many outliers (the maximum of 512.33 is much greater than the 9th decile of 79.13). Most passengers traveled in 3rd class (median of pclass = 3). @@ -234,12 +234,12 @@ Let's move on to outliers. We have several tools for locating outliers (:py:mod: method = "winsorize", alpha = 0.03, ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_drop_clean_3.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_drop_clean_3.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_drop_clean_3.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_drop_clean_3.html Let's encode the column "sex" so we can use it with numerical methods. @@ -252,12 +252,12 @@ Let's encode the column "sex" so we can use it with numerical methods. :okwarning: res = titanic["sex"].label_encode() - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_drop_clean_4.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_drop_clean_4.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_drop_clean_4.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_drop_clean_4.html The column "age" has too many missing values, and since most machine learning algorithms can't handle missing values, we need to impute our data. Let's fill the missing values using the average "age" of the passengers who have the same "pclass" and "sex". @@ -270,12 +270,12 @@ The column "age" has too many missing values and since most machine learning alg :okwarning: res = titanic["age"].fillna(method = "mean", by = ["pclass", "sex"]) - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_drop_clean_5.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_drop_clean_5.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_drop_clean_5.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_drop_clean_5.html Let's draw the correlation matrix to see the links between variables. @@ -290,10 +290,10 @@ Let's draw the correlation matrix to see the links between variables. import verticapy verticapy.set_option("plotting_lib", "plotly") fig = titanic.corr(method = "spearman", width = 800, height = 800) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_titanic_table_corr_matrix.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_titanic_table_corr_matrix.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_corr_matrix.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_corr_matrix.html Fare correlates strongly with family size. This is about what you would expect: a larger family means more tickets, and more tickets means a greater fare. @@ -354,12 +354,12 @@ First, let's look at the number of survivors.
:okwarning: res = titanic_boat["survived"].describe() - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_with_boat.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_with_boat.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_with_boat.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_with_boat.html We have nine deaths. Let's try to understand why these passengers died. @@ -372,12 +372,12 @@ We have nine deaths. Let's try to understand why these passengers died. :okwarning: res = titanic_boat.search(titanic_boat["survived"] == 0).head(10) - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_with_boat_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_with_boat_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_with_boat_2.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_with_boat_2.html Apart from a fair number of these passengers being in third class, there don't seem to be any clear predictors of their deaths. Making a model from this would be unhelpful. @@ -396,12 +396,12 @@ Let's move on to passengers without a lifeboat. :okwarning: res = titanic_no_boat["survived"].describe() - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_without_boat.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_without_boat.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_without_boat.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_without_boat.html Only 20 survived. Let's find out why. @@ -414,12 +414,12 @@ Only 20 survived. Let's find out why. :okwarning: res = titanic_no_boat.search(titanic_no_boat["survived"] == 1).head(20) - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_without_boat_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_without_boat_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_without_boat_2.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_without_boat_2.html Most survivors seem to be women. Let's build a model with this in mind. @@ -452,12 +452,12 @@ One of our predictors is categorical: the passenger title. Some of these predict max_depth = 4, ) res = cross_validate(model, titanic_no_boat, predictors, response) - html_file = open("/project/data/VerticaPy/docs/figures/examples_titanic_table_ml_cv.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_titanic_table_ml_cv.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_ml_cv.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_ml_cv.html This dataset is pretty unbalanced, so we'll use the AUC to evaluate it. Looking at our table, our model has an average AUC of more than 0.9, so our model is quite good. @@ -480,10 +480,10 @@ Let's look at the importance of each feature.
import verticapy verticapy.set_option("plotting_lib", "plotly") fig = model.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_titanic_table_features.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_titanic_table_features.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_titanic_table_features.html + :file: SPHINX_DIRECTORY/figures/examples_titanic_table_features.html As expected, the passenger's title is the most important predictor of survival. diff --git a/docs/source/examples_learn_winequality.rst b/docs/source/examples_learn_winequality.rst index 5efde36b4..c954b1510 100644 --- a/docs/source/examples_learn_winequality.rst +++ b/docs/source/examples_learn_winequality.rst @@ -54,12 +54,12 @@ Let's create a Virtual DataFrame of the dataset. from verticapy.datasets import load_winequality winequality = load_winequality() res = winequality.head(5) - html_file = open("/project/data/VerticaPy/docs/figures/examples_winequality_table_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_winequality_table_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_winequality_table_head.html + :file: SPHINX_DIRECTORY/figures/examples_winequality_table_head.html Data Exploration and Preparation ---------------------------------- @@ -74,12 +74,12 @@ Let's explore the data by displaying descriptive statistics of all the columns. :suppress: res = winequality.describe() - html_file = open("/project/data/VerticaPy/docs/figures/examples_winequality_table_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_winequality_table_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_winequality_table_describe.html + :file: SPHINX_DIRECTORY/figures/examples_winequality_table_describe.html The quality of a wine is based on the equilibrium between certain components: - **For red wines:** tannin/smoothness/acidity @@ -97,12 +97,12 @@ We do, however, have enough data to make a good model for white wines, so let's winequality.filter(winequality["color"] == 'white').drop(["good", "color"]) res = winequality - html_file = open("/project/data/VerticaPy/docs/figures/examples_winequality_table_filter.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_winequality_table_filter.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_winequality_table_filter.html + :file: SPHINX_DIRECTORY/figures/examples_winequality_table_filter.html Let's draw the correlation matrix of the dataset. @@ -116,10 +116,10 @@ Let's draw the correlation matrix of the dataset. import verticapy verticapy.set_option("plotting_lib", "plotly") fig = winequality.corr(method = "spearman", width = 800, height = 800) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_winequality_table_corr_matrix.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_winequality_table_corr_matrix.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_winequality_table_corr_matrix.html + :file: SPHINX_DIRECTORY/figures/examples_winequality_table_corr_matrix.html We can see a strong correlation between the density and the alcohol degree (ethanol is less dense than water, so the alcohol degree strongly affects the wine's density).
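Before dropping the column, a quick sanity check of this relationship is to focus the correlation vector on 'density'. This is only a sketch: it assumes the ``focus`` parameter of ``corr`` restricts the output to a single column, as in the COVID-19 example of this collection.

.. code-block:: python

    # Sketch: rank all variables by their Spearman correlation with 'density';
    # 'alcohol' should stand out with a strong (negative) correlation.
    winequality.corr(method = "spearman", focus = "density")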
@@ -134,12 +134,12 @@ We can drop the 'density' column since it doesn't influence the quality of the w winequality.drop(["density"]) res = winequality - html_file = open("/project/data/VerticaPy/docs/figures/examples_winequality_table_drop.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_winequality_table_drop.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_winequality_table_drop.html + :file: SPHINX_DIRECTORY/figures/examples_winequality_table_drop.html We're working with the scores given by wine tasters, so it's likely that two closely competing wines will have a similar score. Knowing this, a ``k-nearest neighbors`` (KNN) model would be best. @@ -183,12 +183,12 @@ KNN is sensitive to unnormalized data so we'll have to normalize our data. method = "robust_zscore", ) res = winequality - html_file = open("/project/data/VerticaPy/docs/figures/examples_winequality_table_normalize.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_winequality_table_normalize.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_winequality_table_normalize.html + :file: SPHINX_DIRECTORY/figures/examples_winequality_table_normalize.html Machine Learning ----------------- @@ -213,12 +213,12 @@ Let's create our KNN model. predictors = winequality.get_columns(exclude_columns = ["quality"]) model = KNeighborsRegressor(name = "winequality_KNN", n_neighbors = 50) res = cross_validate(model, winequality, predictors, "quality") - html_file = open("/project/data/VerticaPy/docs/figures/examples_winequality_table_ml_cv.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_winequality_table_ml_cv.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_winequality_table_ml_cv.html + :file: SPHINX_DIRECTORY/figures/examples_winequality_table_ml_cv.html Our model is pretty good. Our predicted scores have a median absolute error of less than 0.5. If we want to improve this model, we'll probably need more relevant features. diff --git a/docs/source/examples_understand_africa_education.rst b/docs/source/examples_understand_africa_education.rst index 179abfb46..9ac4b7eef 100644 --- a/docs/source/examples_understand_africa_education.rst +++ b/docs/source/examples_understand_africa_education.rst @@ -121,10 +121,10 @@ Remember our goal: find a way to predict students' final scores ('zralocp' & 'zm verticapy.set_option("plotting_lib", "plotly") africa = africa.sample(x = 0.1) fig = africa.corr(width = 900) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africe_corr_matrix.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africe_corr_matrix.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africe_corr_matrix.html + :file: SPHINX_DIRECTORY/figures/examples_africe_corr_matrix.html Some variables are useless because they are categorizations of others. For example, most scores can go from 0 to 1000, and some variables are created by mapping these variables to a reduced interval (for example: 0 to 10), so we can drop them. @@ -168,12 +168,12 @@ Let's take a look at the missing values. 
:suppress: res = africa.count_percent() - html_file = open("/project/data/VerticaPy/docs/figures/examples_africa_count_percent.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_africa_count_percent.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_count_percent.html + :file: SPHINX_DIRECTORY/figures/examples_africa_count_percent.html Many values are missing for 'zraloct', which is the teachers' test score. We need to find a way to impute them, as they represent more than 10% of the dataset. The other columns have less than 5% missing values, and since our goal is to identify what improves student performance, we can simply filter out those rows. @@ -198,12 +198,12 @@ We'll use two variables to impute the teachers' scores: TEACHER'S SEX (XSEX) and ) africa.dropna() res = africa - html_file = open("/project/data/VerticaPy/docs/figures/examples_africa_after_drop.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_africa_after_drop.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_after_drop.html + :file: SPHINX_DIRECTORY/figures/examples_africa_after_drop.html Now that we have a clean dataset, we can use a Random Forest Regressor to understand what tends to influence a student's final score. @@ -239,10 +239,10 @@ These clusters can be used as inputs by our model. n_cluster = (1, 30), show = True, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_elbow.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_elbow.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_elbow.html + :file: SPHINX_DIRECTORY/figures/examples_africa_elbow.html Eight seems to be a suitable number of clusters. Let's compute a ``k-means`` model. @@ -428,12 +428,12 @@ Our model is excellent. Let's create one for the students' standardized reading response, ) res = model_africa_rf_zralocp.regression_report() - html_file = open("/project/data/VerticaPy/docs/figures/examples_africa_reg_report_zralocp.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_africa_reg_report_zralocp.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_reg_report_zralocp.html + :file: SPHINX_DIRECTORY/figures/examples_africa_reg_report_zralocp.html We'll also create one for the students' standardized mathematics score ('zmalocp'). @@ -460,12 +460,12 @@ We'll also create one for the students' standardized mathematics score ('zmalocp response, ) res = model_africa_rf_zmalocp.regression_report() - html_file = open("/project/data/VerticaPy/docs/figures/examples_africa_reg_report_zmalocp.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_africa_reg_report_zmalocp.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_reg_report_zmalocp.html + :file: SPHINX_DIRECTORY/figures/examples_africa_reg_report_zmalocp.html Let's look at the feature importance for each model. @@ -479,10 +479,10 @@ Let's look at the feature importance for each model.
vp.set_option("plotting_lib", "plotly") fig = model_africa_rf_zralocp.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_feature_zralocp.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_feature_zralocp.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_feature_zralocp.html + :file: SPHINX_DIRECTORY/figures/examples_africa_feature_zralocp.html .. code-block:: python @@ -493,10 +493,10 @@ Let's look at the feature importance for each model. :okwarning: fig = model_africa_rf_zmalocp.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_feature_zmalocp.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_feature_zmalocp.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_feature_zmalocp.html + :file: SPHINX_DIRECTORY/figures/examples_africa_feature_zmalocp.html Feature importance for the math score and the reading score is almost identical. @@ -540,10 +540,10 @@ Let's visualize our model. We begin by creating a bubble plot using the two scor by = "PENGLISH", max_nb_points = 2000, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_scatter_bubble.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_scatter_bubble.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_scatter_bubble.html + :file: SPHINX_DIRECTORY/figures/examples_africa_scatter_bubble.html Notable influences are home language and socioeconomic status. It seems like students who both speak English at home often (but not all the time) and have a comfortable standard of living tend to perform the best. @@ -569,10 +569,10 @@ Now, let's see how a student's nationality might affect their performance. max_cardinality = 50, width = 800, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_bar_90_country_long.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_bar_90_country_long.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_bar_90_country_long.html + :file: SPHINX_DIRECTORY/figures/examples_africa_bar_90_country_long.html .. code-block:: python @@ -594,10 +594,10 @@ Now, let's see how a student's nationality might affect their performance. max_cardinality = 50, width = 800, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_bar_10_country_long.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_bar_10_country_long.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_bar_10_country_long.html + :file: SPHINX_DIRECTORY/figures/examples_africa_bar_10_country_long.html The students' nationalities seem to have a big impact. For example, exams in Swaziland, Kenya, and Tanzania might @@ -628,10 +628,10 @@ be harder than the others. Let's break this down by region. max_cardinality = 50, width = 1000, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_bar_district.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_bar_district.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_bar_district.html + :file: SPHINX_DIRECTORY/figures/examples_africa_bar_district.html The same applies to the regions. Let's look at student age. @@ -653,10 +653,10 @@ The same applies to the regions. Let's look at student age.
of = "pred_zmalocp", max_cardinality = 50, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_bar_page.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_bar_page.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_bar_page.html + :file: SPHINX_DIRECTORY/figures/examples_africa_bar_page.html Let's look at the the variables 'PLIGHT' (a student's main lighting source) and 'PREPEAT' (repeated years). @@ -679,10 +679,10 @@ Let's look at the the variables 'PLIGHT' (a student's main lighting source) and of = "pred_zmalocp", width = 850, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_bar_prepeat_plight.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_bar_prepeat_plight.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_bar_prepeat_plight.html + :file: SPHINX_DIRECTORY/figures/examples_africa_bar_prepeat_plight.html We can see that students who never repeated a year and have light at home tend to do better in school than those who don't. @@ -706,10 +706,10 @@ Another factor in a student's performance might be their method of transportatio of = "pred_zmalocp", width = 850, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_bar_ptravel2.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_bar_ptravel2.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_bar_ptravel2.html + :file: SPHINX_DIRECTORY/figures/examples_africa_bar_ptravel2.html We can clearly see that the more inconvenient it is to get to school, the worse students tend to perform. @@ -737,10 +737,10 @@ Let's look at the influence of the 'district'. of = "pred_zmalocp", h = 100, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_bar_district_50_pred.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_bar_district_50_pred.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_bar_district_50_pred.html + :file: SPHINX_DIRECTORY/figures/examples_africa_bar_district_50_pred.html Here, we can see that Chicualacuala has a very high median score, so we can conclude that a students' district might impact their performance in school. @@ -912,10 +912,10 @@ Let's create a logistic regression to understand what circumstances allowed thes model_africa_logit_best = LogisticRegression(name="africa_logit_best",solver="BFGS") model_africa_logit_best.fit(africa,predictors,"best") fig = model_africa_logit_best.features_importance() - fig.write_html("/project/data/VerticaPy/docs/figures/examples_africa_feature_final.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_africa_feature_final.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_africa_feature_final.html + :file: SPHINX_DIRECTORY/figures/examples_africa_feature_final.html We can see that the best students tend to be young, speak English at home, come from a good socioeconomic background, have a father with a degree, and live relatively close to school. diff --git a/docs/source/examples_understand_amazon.rst b/docs/source/examples_understand_amazon.rst index 73bf39b98..bc7c14f38 100644 --- a/docs/source/examples_understand_amazon.rst +++ b/docs/source/examples_understand_amazon.rst @@ -46,12 +46,12 @@ Let's create a Virtual DataFrame of the dataset. 
amazon = load_amazon() res = amazon.head(5) - html_file = open("/project/data/VerticaPy/docs/figures/examples_amazon_table_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_amazon_table_head.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_amazon_table_head.html + :file: SPHINX_DIRECTORY/figures/examples_amazon_table_head.html Data Exploration and Preparation --------------------------------- @@ -66,12 +66,12 @@ We can explore our data by displaying descriptive statistics of all the columns. :suppress: res = amazon.describe(method = "categorical", unique = True) - html_file = open("/project/data/VerticaPy/docs/figures/examples_amazon_table_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_amazon_table_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_amazon_table_describe.html + :file: SPHINX_DIRECTORY/figures/examples_amazon_table_describe.html Using the :py:func:`~verticapy.vDataFrame.describe` method, we can see that our data ranges from the beginning of 1998 to the end of 2017. @@ -83,12 +83,12 @@ Using the :py:func:`~verticapy.vDataFrame.describe` method, we can see that our :suppress: res = amazon["date"].describe() - html_file = open("/project/data/VerticaPy/docs/figures/examples_amazon_table_describe_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_amazon_table_describe_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_amazon_table_describe_2.html + :file: SPHINX_DIRECTORY/figures/examples_amazon_table_describe_2.html Brazil has dry and rainy seasons. Knowing this, we would expect that the frequency of forest fires varies between seasons. Let's confirm our hypothesis using an autocorrelation plot with 48 lags (4 years). @@ -112,10 +112,10 @@ Brazil has dry and rainy seasons. Knowing this, we would expect that the frequen by = ["state"], p = 48, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_amazon_table_acf.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_amazon_table_acf.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_amazon_table_acf.html + :file: SPHINX_DIRECTORY/figures/examples_amazon_table_acf.html The process is not stationary. Let's use a Dickey-Fuller test to confirm our hypothesis. @@ -143,12 +143,12 @@ The process is not stationary. Let's use a Dickey-Fuller test to confirm our hyp by = ["state"], p = 48, ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_amazon_adfuller.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_amazon_adfuller.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_amazon_adfuller.html + :file: SPHINX_DIRECTORY/figures/examples_amazon_adfuller.html The effects of each season seem pretty clear.
We can see this graphically using ts = "date", by = "state", ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_amazon_table_cum_sum.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_amazon_table_cum_sum.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_amazon_table_cum_sum.html + :file: SPHINX_DIRECTORY/figures/examples_amazon_table_cum_sum.html We can clearly observe the seasonality within each state, which contributes to an overall global seasonality. Let's plot the total number of forest fires to illustrate this more clearly. @@ -214,10 +214,10 @@ We can clearly observe the seasonality within each state, which contributes to a ], ) fig = amazon["number"].plot(ts = "date") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_amazon_table_plot_2.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_amazon_table_plot_2.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_amazon_table_plot_2.html + :file: SPHINX_DIRECTORY/figures/examples_amazon_table_plot_2.html Although it would be preferable to use seasonal decomposition and predict the residuals, let's build an ARIMA model on the data. @@ -256,12 +256,12 @@ Since the seasonality occurs monthly, we set ``p = 12``. There is no trend in th ts = "date", ) res = model.regression_report(start = 50) - html_file = open("/project/data/VerticaPy/docs/figures/examples_amazon_table_ml_cv.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_amazon_table_ml_cv.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_amazon_table_ml_cv.html + :file: SPHINX_DIRECTORY/figures/examples_amazon_table_ml_cv.html Our model is quite good. Let's look at our predictions. @@ -287,10 +287,10 @@ Our model is quite good. Let's look at our predictions. npredictions = 40, method = "auto", ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_amazon_table_plot_ml_2.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_amazon_table_plot_ml_2.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_amazon_table_plot_ml_2.html + :file: SPHINX_DIRECTORY/figures/examples_amazon_table_plot_ml_2.html The plot shows that our model has successfully captured the seasonality present in the data. However, to improve the model, we should remove the seasonality and focus on predicting the residuals directly. The current model is not entirely stable and requires further adjustments. diff --git a/docs/source/examples_understand_covid19.rst b/docs/source/examples_understand_covid19.rst index 33a055e49..74615346c 100644 --- a/docs/source/examples_understand_covid19.rst +++ b/docs/source/examples_understand_covid19.rst @@ -45,14 +45,14 @@ Let's create a Virtual DataFrame of the dataset. The dataset is available `here .. ipython:: python :suppress: - covid19 = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/covid19/deaths.csv") + covid19 = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/covid19/deaths.csv") res = covid19.head(10) - html_file = open("/project/data/VerticaPy/docs/figures/examples_commodities_table_head.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_covid19_table_head.html", "w") html_file.write(res._repr_html_()) html_file.close() ..
raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_head.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_head.html Data Exploration and Preparation --------------------------------- @@ -67,12 +67,12 @@ Let's explore the data by displaying descriptive statistics of all the columns. :suppress: res = covid19.describe(method = "categorical", unique = True) - html_file = open("/project/data/VerticaPy/docs/figures/examples_covid19_table_describe.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_covid19_table_describe.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_describe.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_describe.html We have data from January 2020 to the beginning of May. @@ -84,12 +84,12 @@ We have data from January 2020 to the beginning of May. :suppress: res = covid19["date"].describe() - html_file = open("/project/data/VerticaPy/docs/figures/examples_covid19_table_describe_2.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_covid19_table_describe_2.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_describe_2.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_describe_2.html We'll try to predict the number of future deaths by using the statistics from previous days. We can drop the columns 'county' and 'fips', since the scope of our analysis is focused on the United States and the FIPS code isn't relevant to our predictions. @@ -101,12 +101,12 @@ We'll try to predict the number of future deaths by using the statistics from pr :suppress: res = covid19.drop(["fips", "county"]) - html_file = open("/project/data/VerticaPy/docs/figures/examples_covid19_table_drop_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_covid19_table_drop_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_drop_1.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_drop_1.html Let's sum the number of deaths and cases by state and date. @@ -142,12 +142,12 @@ Let's sum the number of deaths and cases by state and date. ], ) res = covid19.head(10) - html_file = open("/project/data/VerticaPy/docs/figures/examples_covid19_table_clean_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_covid19_table_clean_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_clean_1.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_clean_1.html Let's look at the autocorrelation graphic of the number of deaths. @@ -171,10 +171,10 @@ Let's look at the autocorrelation graphic of the number of deaths. by = ["state"], p = 24, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_covid19_table_plot_acf.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_covid19_table_plot_acf.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_plot_acf.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_plot_acf.html The process doesn't seem to be stationary. Let's use a Dickey-Fuller test to confirm our hypothesis. @@ -203,12 +203,12 @@ The process doesn't seem to be stationary. 
Let's use a Dickey-Fuller test to con by = ["state"], p = 12, ) - html_file = open("/project/data/VerticaPy/docs/figures/examples_covid19_adfuller_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_covid19_adfuller_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_adfuller_1.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_adfuller_1.html We can look at the cumulative number of deaths and its exponentiality. @@ -226,10 +226,10 @@ We can look at the cumulative number of deaths and its exponentiality. ts = "date", by = "state", ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_covid19_table_plot_3.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_covid19_table_plot_3.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_plot_3.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_plot_3.html Let's plot this for the entire country. @@ -249,10 +249,10 @@ Let's plot this for the entire country. [fun.sum(covid19["deaths"])._as("deaths")], ) fig = covid["deaths"].plot(ts = "date") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_covid19_table_plot_4.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_covid19_table_plot_4.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_plot_4.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_plot_4.html As you would expect, there's a clear correlation between the number of people infected and the number of deaths. @@ -270,12 +270,12 @@ A vector autoregression (VAR) model can be very good to do the predictions. But :suppress: res = covid19["state"].one_hot_encode() - html_file = open("/project/data/VerticaPy/docs/figures/examples_covid19_one_hot_encode_1.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_covid19_one_hot_encode_1.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_one_hot_encode_1.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_one_hot_encode_1.html Because of the upward monotonic trend, we can also look at the correlation between the days elapsed and the number of cases. @@ -302,10 +302,10 @@ Let's see the correlation between the number of deaths and the other variables. :suppress: fig = covid19.corr(focus = "deaths") - fig.write_html("/project/data/VerticaPy/docs/figures/examples_covid19_table_plot_corr_5.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_covid19_table_plot_corr_5.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_plot_corr_5.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_plot_corr_5.html We can see clearly a high correlation for some variables. We can use them to compute a ``SARIMAX`` model, but we'll stick to a :py:mod:`~verticapy.machine_learning.vertica.VAR` model for this study. @@ -324,7 +324,7 @@ Let's compute the total number of deaths and cases to create our VAR model. .. 
ipython:: python :suppress: - covid19 = vp.read_csv("/project/data/VerticaPy/docs/source/_static/website/examples/data/covid19/deaths.csv").groupby( + covid19 = vp.read_csv("SPHINX_DIRECTORY/source/_static/website/examples/data/covid19/deaths.csv").groupby( ["date"], [ fun.sum(covid19["deaths"])._as("deaths"), @@ -364,12 +364,12 @@ Let's create a :py:mod:`~verticapy.machine_learning.vertica.VAR` model to predic return_report = True, ) res = model.score(start = 20) - html_file = open("/project/data/VerticaPy/docs/figures/examples_covid19_table_ml_score.html", "w") + html_file = open("SPHINX_DIRECTORY/figures/examples_covid19_table_ml_score.html", "w") html_file.write(res._repr_html_()) html_file.close() .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_ml_score.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_ml_score.html Our model is not bad. Let's predict the number of deaths in the near future. @@ -395,10 +395,10 @@ Cases: npredictions = 10, idx = 0, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_covid19_table_pred_plot_0.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_covid19_table_pred_plot_0.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_pred_plot_0.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_pred_plot_0.html Deaths: ++++++++ @@ -422,10 +422,10 @@ Deaths: npredictions = 10, idx = 1, ) - fig.write_html("/project/data/VerticaPy/docs/figures/examples_covid19_table_pred_plot_1.html") + fig.write_html("SPHINX_DIRECTORY/figures/examples_covid19_table_pred_plot_1.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/examples_covid19_table_pred_plot_1.html + :file: SPHINX_DIRECTORY/figures/examples_covid19_table_pred_plot_1.html The model performs well but may be somewhat unstable. To improve it, we could apply data preparation techniques, such as seasonal decomposition, before building the VAR model. diff --git a/docs/source/user_guide_data_exploration_charts.rst b/docs/source/user_guide_data_exploration_charts.rst index 2869d8480..55f2695f7 100644 --- a/docs/source/user_guide_data_exploration_charts.rst +++ b/docs/source/user_guide_data_exploration_charts.rst @@ -46,7 +46,7 @@ Let's start with pies and histograms. Drawing the pie or histogram of a categori file.write(html_text) .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_bar.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_bar.html .. code-block:: @@ -61,7 +61,7 @@ Let's start with pies and histograms. Drawing the pie or histogram of a categori file.write(html_text) .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_pie.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_pie.html .. code-block:: @@ -76,7 +76,7 @@ Let's start with pies and histograms. Drawing the pie or histogram of a categori file.write(html_text) .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_home_dest_bar.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_home_dest_bar.html These methods draw the most frequently occurring categories and merge the others. To change the number of elements, you can use the ``max_cardinality`` parameter. @@ -93,7 +93,7 @@ These methods will draw the most occurent categories and merge the others. To ch file.write(html_text) ..
raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_home_dest_bar_max_cardinality.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_home_dest_bar_max_cardinality.html When dealing with numerical data types, the process is different. Vertica needs to discretize the numerical features to draw them. You can choose the bar width (``h`` parameter) or let VerticaPy compute an optimal width using the Freedman-Diaconis rule. @@ -110,7 +110,7 @@ When dealing with numerical data types, the process is different. Vertica needs file.write(html_text) .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_age_hist.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_age_hist.html .. code-block:: @@ -125,7 +125,7 @@ When dealing with numerical data types, the process is different. Vertica needs file.write(html_text) .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_age_hist_h5.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_age_hist_h5.html You can also replace the occurrence counts with another aggregation using the ``method`` and ``of`` parameters. @@ -142,7 +142,7 @@ You can also change the occurences by another aggregation with the ``method`` an file.write(html_text) .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_age_hist_avs.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_age_hist_avs.html VerticaPy uses the same process for other graphics, like 2-dimensional histograms and bar charts. @@ -161,10 +161,10 @@ Let us showcase another plotting library for these plots. # Setting the plotting lib vp.set_option("plotting_lib", "plotly") fig = titanic.bar(["pclass", "survived"]) - fig.write_html("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_bar_pclass_surv.html") + fig.write_html("SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_bar_pclass_surv.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_bar_pclass_surv.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_bar_pclass_surv.html .. note:: VerticaPy has three main plotting libraries. Look at the :ref:`chart_gallery` section for all the different plots. @@ -184,10 +184,10 @@ Let us showcase another plotting library for these plots. method = "avg", of = "survived", ) - fig.write_html("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_bar_pclass_fare.html") + fig.write_html("SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_bar_pclass_fare.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_bar_pclass_fare.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_bar_pclass_fare.html Pivot tables give us aggregated information for every category and are more powerful than histograms or bar charts. @@ -210,10 +210,10 @@ Pivot tables give us aggregated information for every category and are more powe of = "survived", fill_none = np.nan, ) - fig.write_html("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_bar_pclass_fare_fill.html") + fig.write_html("SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_bar_pclass_fare_fill.html") ..
raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_bar_pclass_fare_fill.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_bar_pclass_fare_fill.html Box plots are useful for understanding statistical dispersion. @@ -226,10 +226,10 @@ Box plots are useful for understanding statistical dispersion. :okwarning: fig = titanic.boxplot(columns = ["age", "fare"]) - fig.write_html("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_boxplot.html") + fig.write_html("SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_boxplot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_boxplot.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_boxplot.html .. code-block:: @@ -240,10 +240,10 @@ Box plots are useful for understanding statistical dispersion. :okwarning: fig = titanic["age"].boxplot() - fig.write_html("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_boxplot_one.html") + fig.write_html("SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_boxplot_one.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_titanic_boxplot_one.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_titanic_boxplot_one.html Scatter and bubble plots are also useful for identifying patterns in your data. Note, however, that these methods don't use aggregations; VerticaPy downsamples the data before plotting. You can use the ``max_nb_points`` parameter to limit the number of points and avoid unnecessary memory usage. @@ -266,10 +266,10 @@ Scatter and bubble plots are also useful for identifying patterns in your data. by = "Species", max_nb_points = 1000, ) - fig.write_html("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_iris_scatter.html") + fig.write_html("SPHINX_DIRECTORY/figures/user_guides_data_exploration_iris_scatter.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_iris_scatter.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_iris_scatter.html Now, let us look at a 3D scatter plot. @@ -290,10 +290,10 @@ Now, let us look at a 3D scatter plot. by = "Species", max_nb_points = 1000, ) - fig.write_html("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_iris_scatter_3d.html") + fig.write_html("SPHINX_DIRECTORY/figures/user_guides_data_exploration_iris_scatter_3d.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_iris_scatter_3d.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_iris_scatter_3d.html Similarly, we can draw a bubble plot: @@ -316,10 +316,10 @@ Similarly, we can plot a bubble plot: by = "Species", max_nb_points = 1000, ) - fig.write_html("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_iris_scatter_bubble.html") + fig.write_html("SPHINX_DIRECTORY/figures/user_guides_data_exploration_iris_scatter_bubble.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_iris_scatter_bubble.html + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_iris_scatter_bubble.html For more information on scatter plots, see :py:func:`verticapy.vDataFrame.scatter`. @@ -362,7 +362,7 @@ Hexbin, scatter, and bubble plots also allow you to provide a background image.
["lon", "lat"], method = "avg", of = "zralocp", - img = "/project/data/VerticaPy/docs/source/_static/website/user_guides/data_exploration/africa.png" + img = "SPHINX_DIRECTORY/source/_static/website/user_guides/data_exploration/africa.png" ) It is also possible to use SHP datasets to draw maps. @@ -416,7 +416,7 @@ Since time-series plots do not aggregate the data, it's important to choose the by = "state", start_date = "2010-01-01", ) - fig.write_html("/project/data/VerticaPy/docs/figures/user_guides_data_exploration_amazon_time_plot.html") + fig.write_html("SPHINX_DIRECTORY/figures/user_guides_data_exploration_amazon_time_plot.html") .. raw:: html - :file: /project/data/VerticaPy/docs/figures/user_guides_data_exploration_amazon_time_plot.html \ No newline at end of file + :file: SPHINX_DIRECTORY/figures/user_guides_data_exploration_amazon_time_plot.html \ No newline at end of file diff --git a/docs/source/user_guide_data_exploration_correlations.rst b/docs/source/user_guide_data_exploration_correlations.rst index 1ef1c2ecc..bc97bfe23 100644 --- a/docs/source/user_guide_data_exploration_correlations.rst +++ b/docs/source/user_guide_data_exploration_correlations.rst @@ -22,15 +22,15 @@ Let's use the `Telco Churn dataset