diff --git a/advanced_tutorials/air_quality/1_air_quality_feature_backfill.ipynb b/advanced_tutorials/air_quality/1_air_quality_feature_backfill.ipynb index 75f1b875..5ad876f9 100644 --- a/advanced_tutorials/air_quality/1_air_quality_feature_backfill.ipynb +++ b/advanced_tutorials/air_quality/1_air_quality_feature_backfill.ipynb @@ -1131,7 +1131,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1145,7 +1145,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.8.18" } }, "nbformat": 4, diff --git a/advanced_tutorials/aml/2_aml_training_pipeline.ipynb b/advanced_tutorials/aml/2_aml_training_pipeline.ipynb index 6737cafe..cc9dfad5 100644 --- a/advanced_tutorials/aml/2_aml_training_pipeline.ipynb +++ b/advanced_tutorials/aml/2_aml_training_pipeline.ipynb @@ -151,16 +151,16 @@ "min_max_scaler = fs.get_transformation_function(name=\"min_max_scaler\")\n", "\n", "# Map features to transformations.\n", - "transformation_functions = {\n", - " \"monthly_in_count\": min_max_scaler,\n", - " \"monthly_in_total_amount\": min_max_scaler,\n", - " \"monthly_in_mean_amount\": min_max_scaler,\n", - " \"monthly_in_std_amount\": min_max_scaler,\n", - " \"monthly_out_count\": min_max_scaler,\n", - " \"monthly_out_total_amount\": min_max_scaler,\n", - " \"monthly_out_mean_amount\": min_max_scaler,\n", - " \"monthly_out_std_amount\": min_max_scaler,\n", - "}" + "transformation_functions = [\n", + " min_max_scaler(\"monthly_in_count\"),\n", + " min_max_scaler(\"monthly_in_total_amount\"),\n", + " min_max_scaler(\"monthly_in_mean_amount\"),\n", + " min_max_scaler(\"monthly_in_std_amount\"),\n", + " min_max_scaler(\"monthly_out_count\"),\n", + " min_max_scaler(\"monthly_out_total_amount\"),\n", + " min_max_scaler(\"monthly_out_mean_amount\"),\n", + " min_max_scaler(\"monthly_out_std_amount\"),\n", + "]" ] }, { diff --git a/advanced_tutorials/bitcoin/3_bitcoin_training_pipeline.ipynb b/advanced_tutorials/bitcoin/3_bitcoin_training_pipeline.ipynb index 2678ec60..f858baaa 100644 --- a/advanced_tutorials/bitcoin/3_bitcoin_training_pipeline.ipynb +++ b/advanced_tutorials/bitcoin/3_bitcoin_training_pipeline.ipynb @@ -214,7 +214,7 @@ "min_max_scaler = fs.get_transformation_function(name=\"min_max_scaler\")\n", "\n", "# Map features to transformation functions.\n", - "transformation_functions = {col: min_max_scaler for col in columns_to_transform}" + "transformation_functions = [min_max_scaler(col) for col in columns_to_transform]" ] }, { @@ -522,22 +522,22 @@ "metadata": {}, "outputs": [], "source": [ - "import inspect \n", - "# Recall that you applied transformation functions, such as min max scaler and laber encoder. 
\n", - "# Now you want to transform them back to human readable format.\n", + "# Initializing serving\n", "feature_view.init_serving(1)\n", - "td_transformation_functions = feature_view._single_vector_server._transformation_functions\n", + "\n", + "# Accessing the transformation functions used in the serving configuration\n", + "fv_transformation_functions = feature_view._vector_server.model_dependent_transformation_functions\n", "\n", "y_pred = pd.DataFrame(y_pred_scaled, columns=[\"close\"])\n", "\n", - "for feature_name in td_transformation_functions:\n", - " if feature_name == \"close\":\n", - " td_transformation_function = td_transformation_functions[feature_name]\n", - " sig, foobar_locals = inspect.signature(td_transformation_function.transformation_fn), locals()\n", - " param_dict = dict([(param.name, param.default) for param in sig.parameters.values() if param.default != inspect._empty])\n", - " if td_transformation_function.name == \"min_max_scaler\":\n", - " y_pred[feature_name] = y_pred[feature_name].map(lambda x: x*(param_dict[\"max_value\"]-param_dict[\"min_value\"])+param_dict[\"min_value\"])\n", - " y_test[feature_name] = y_test[feature_name].map(lambda x: x*(param_dict[\"max_value\"]-param_dict[\"min_value\"])+param_dict[\"min_value\"])" + "for transformation_function in fv_transformation_functions:\n", + " udf = transformation_function.hopsworks_udf\n", + " transformation_feature = udf.transformation_features[0]\n", + " transformed_feature = udf.transformation_features[0]\n", + " if transformed_feature == \"close\" and udf.function_name == \"min_max_scaler\":\n", + " stats = udf.transformation_statistics\n", + " y_pred[transformation_feature] = y_pred[transformation_feature].map(lambda x: x*(stats.feature.max-stats.feature.min)+stats.feature.min)\n", + " y_test[transformed_feature] = y_test[transformed_feature].map(lambda x: x*(stats.feature.max-stats.feature.min)+stats.feature.min)" ] }, { @@ -996,7 +996,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.12.5" }, "vscode": { "interpreter": { diff --git a/advanced_tutorials/credit_scores/3_credit_scores_training_pipeline.ipynb b/advanced_tutorials/credit_scores/3_credit_scores_training_pipeline.ipynb index 4172e7da..ebee6b8b 100644 --- a/advanced_tutorials/credit_scores/3_credit_scores_training_pipeline.ipynb +++ b/advanced_tutorials/credit_scores/3_credit_scores_training_pipeline.ipynb @@ -261,7 +261,7 @@ "outputs": [], "source": [ "# Retrieving the names of all available transformation functions\n", - "[t_func.name for t_func in fs.get_transformation_functions()]" + "[t_func for t_func in fs.get_transformation_functions()]" ] }, { @@ -290,11 +290,11 @@ "label_encoder = fs.get_transformation_function(name='label_encoder') \n", "\n", "# Creating a dictionary of transformation functions, where each categorical column is associated with the Label Encoder\n", - "transformation_functions = {\n", - " col: label_encoder\n", + "transformation_functions = [\n", + " label_encoder(col)\n", " for col \n", " in cat_cols\n", - "}" + "]" ] }, { diff --git a/advanced_tutorials/electricity/3_electricity_training_pipeline.ipynb b/advanced_tutorials/electricity/3_electricity_training_pipeline.ipynb index 5abdf401..d607015f 100644 --- a/advanced_tutorials/electricity/3_electricity_training_pipeline.ipynb +++ b/advanced_tutorials/electricity/3_electricity_training_pipeline.ipynb @@ -163,21 +163,25 @@ "# List of price areas\n", "price_areas = [\"se1\", \"se2\", \"se3\", 
\"se4\"]\n", "\n", + "# Retrieving transformation functions\n", + "min_max_scaler = fs.get_transformation_function(name=\"min_max_scaler\")\n", + "label_encoder = fs.get_transformation_function(name='label_encoder')\n", + "\n", "# Mapping features to their respective transformation functions\n", - "mapping_transformers = {}\n", + "transformation_functions = []\n", "\n", "# Iterate through each price area and map features to their transformation functions\n", "for area in price_areas:\n", - " mapping_transformers[f\"price_{area}\"] = fs.get_transformation_function(name=\"min_max_scaler\")\n", - " mapping_transformers[f\"mean_temp_per_day_{area}\"] = fs.get_transformation_function(name=\"min_max_scaler\")\n", - " mapping_transformers[f\"mean_wind_speed_{area}\"] = fs.get_transformation_function(name=\"min_max_scaler\")\n", - " mapping_transformers[f\"precipitaton_amount_{area}\"] = fs.get_transformation_function(name=\"min_max_scaler\")\n", - " mapping_transformers[f\"total_sunshine_time_{area}\"] = fs.get_transformation_function(name=\"min_max_scaler\")\n", - " mapping_transformers[f\"mean_cloud_perc_{area}\"] = fs.get_transformation_function(name=\"min_max_scaler\") \n", - " mapping_transformers[f\"precipitaton_type_{area}\"] = fs.get_transformation_function(name='label_encoder')\n", + " transformation_functions.append(min_max_scaler(f\"price_{area}\"))\n", + " transformation_functions.append(min_max_scaler(f\"mean_temp_per_day_{area}\"))\n", + " transformation_functions.append(min_max_scaler(f\"mean_wind_speed_{area}\"))\n", + " transformation_functions.append(min_max_scaler(f\"precipitaton_amount_{area}\"))\n", + " transformation_functions.append(min_max_scaler(f\"total_sunshine_time_{area}\"))\n", + " transformation_functions.append(min_max_scaler(f\"mean_cloud_perc_{area}\"))\n", + " transformation_functions.append(label_encoder(f\"precipitaton_type_{area}\"))\n", "\n", "# Additional transformation for 'type_of_day'\n", - "mapping_transformers[\"type_of_day\"] = fs.get_transformation_function(name='label_encoder')" + "transformation_functions.append(label_encoder(\"type_of_day\"))" ] }, { @@ -215,7 +219,7 @@ " name='electricity_feature_view',\n", " version=1,\n", " labels=[], # you will define our 'y' later manualy\n", - " transformation_functions=mapping_transformers,\n", + " transformation_functions=transformation_functions,\n", " query=selected_features,\n", ")" ] @@ -304,9 +308,9 @@ "outputs": [], "source": [ "# Define 'y_train', 'y_val' and 'y_test'\n", - "y_train = X_train[[\"price_se1\", \"price_se2\", \"price_se3\", \"price_se4\"]]\n", - "y_val = X_val[[\"price_se1\", \"price_se2\", \"price_se3\", \"price_se4\"]]\n", - "y_test = X_test[[\"price_se1\", \"price_se2\", \"price_se3\", \"price_se4\"]]" + "y_train = X_train[[\"min_max_scaler_price_se1_\", \"min_max_scaler_price_se2_\", \"min_max_scaler_price_se3_\", \"min_max_scaler_price_se4_\"]]\n", + "y_val = X_val[[\"min_max_scaler_price_se1_\", \"min_max_scaler_price_se2_\", \"min_max_scaler_price_se3_\", \"min_max_scaler_price_se4_\"]]\n", + "y_test = X_test[[\"min_max_scaler_price_se1_\", \"min_max_scaler_price_se2_\", \"min_max_scaler_price_se3_\", \"min_max_scaler_price_se4_\"]]" ] }, { @@ -494,7 +498,7 @@ " input_width=4, \n", " label_width=4, \n", " shift=1, \n", - " label_columns=[\"price_se1\", \"price_se2\", \"price_se3\", \"price_se4\"],\n", + " label_columns=[\"min_max_scaler_price_se1_\", \"min_max_scaler_price_se2_\", \"min_max_scaler_price_se3_\", \"min_max_scaler_price_se4_\"],\n", ")\n", "\n", "# Displaying the 
WindowGenerator instance\n", @@ -710,7 +714,7 @@ "source": [ "# Plotting the time series data for the 'price_se4' column\n", "n_step_window.plot(\n", - " plot_col=\"price_se4\", \n", + " plot_col=\"min_max_scaler_price_se4_\", \n", " max_subplots=3, \n", " model=model.predict,\n", ")" diff --git a/advanced_tutorials/electricity/4_electricity_batch_inference.ipynb b/advanced_tutorials/electricity/4_electricity_batch_inference.ipynb index 1a2d7c94..f31dbf62 100644 --- a/advanced_tutorials/electricity/4_electricity_batch_inference.ipynb +++ b/advanced_tutorials/electricity/4_electricity_batch_inference.ipynb @@ -250,7 +250,7 @@ "feature_view.init_serving(1)\n", "\n", "# Accessing the transformation functions used in the serving configuration\n", - "td_transformation_functions = feature_view._batch_scoring_server._transformation_functions" + "fv_transformation_functions = feature_view._vector_server.model_dependent_transformation_functions" ] }, { @@ -268,15 +268,12 @@ "\n", "# Extracting and decoding the transformation functions used in serving\n", "res = {}\n", - "for feature_name in td_transformation_functions:\n", - " if feature_name in [\"price_se1\", \"price_se2\", \"price_se3\", \"price_se4\"]:\n", - " td_transformation_function = td_transformation_functions[feature_name]\n", - " sig, foobar_locals = inspect.signature(td_transformation_function.transformation_fn), locals()\n", - " param_dict = dict([(param.name, param.default) for param in sig.parameters.values() if param.default != inspect._empty])\n", - " if td_transformation_function.name == \"min_max_scaler\":\n", - " preds[feature_name] = preds[feature_name].map(\n", - " lambda x: x * (param_dict[\"max_value\"] - param_dict[\"min_value\"]) + param_dict[\"min_value\"]\n", - " )\n", + "for transformation_function in fv_transformation_functions:\n", + " udf = transformation_function.hopsworks_udf\n", + " transformed_features = udf.transformation_features[0]\n", + " if transformed_features in [\"price_se1\", \"price_se2\", \"price_se3\", \"price_se4\"] and udf.function_name == \"min_max_scaler\":\n", + " stats = udf.transformation_statistics\n", + " preds[transformed_features] = preds[transformed_features].map(lambda x: x*(stats.feature.max-stats.feature.min)+stats.feature.min)\n", "\n", "# Applying a transformation to reverse the sign of the decoded features\n", "preds = preds.apply(lambda x: -x)\n", @@ -336,7 +333,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb b/advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb index e8583a6a..a29823e1 100644 --- a/advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb +++ b/advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb @@ -129,7 +129,7 @@ "metadata": {}, "outputs": [], "source": [ - "[f.name for f in fs.get_transformation_functions()]" + "[f for f in fs.get_transformation_functions()]" ] }, { @@ -153,11 +153,11 @@ "source": [ "features_category = ['gender', 'age_cat', 'blood_gp', 'underlying_disease', 'gestation', 'prior_transplant', 'if_transplanted']\n", "\n", - "transformation_functions_category = {\n", - " feature_name: label_encoder\n", + "transformation_functions_category = [\n", + " label_encoder(feature_name)\n", " for feature_name\n", " in features_category\n", - "}" + "]" ] }, { @@ -171,11 +171,11 @@ " 'age_at_list_registration', 'dialysis_duration', 
'number_prior_transplant', 'cpra', 'hla_a1', 'hla_a2', 'hla_b1', 'hla_b2', 'hla_dr1', 'hla_dr2',\n", "]\n", "\n", - "transformation_functions_numerical = {\n", - " feature_name: standard_scaler\n", + "transformation_functions_numerical = [\n", + " standard_scaler(feature_name)\n", " for feature_name\n", " in features_numerical\n", - "}" + "]" ] }, { @@ -185,8 +185,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Join transformation_functions_category and transformation_functions_numerical dictionaries into one\n", - "transformation_functions = transformation_functions_category | transformation_functions_numerical" + "# Join transformation_functions_category and transformation_functions_numerical lists into one\n", + "transformation_functions = transformation_functions_category + transformation_functions_numerical" ] }, { diff --git a/advanced_tutorials/on_demand_feature/notebooks/3_feature_view_td_modelling.ipynb b/advanced_tutorials/on_demand_feature/notebooks/3_feature_view_td_modelling.ipynb index f66bc275..37fb952c 100644 --- a/advanced_tutorials/on_demand_feature/notebooks/3_feature_view_td_modelling.ipynb +++ b/advanced_tutorials/on_demand_feature/notebooks/3_feature_view_td_modelling.ipynb @@ -99,9 +99,9 @@ "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n", "\n", "# Map features to transformation functions.\n", - "transformation_functions = {\n", - " \"ocean_proximity\": label_encoder,\n", - "}" + "transformation_functions = [\n", + " label_encoder(\"ocean_proximity\")\n", + "]" ] }, { diff --git a/advanced_tutorials/timeseries/1_feature_backfill.ipynb b/advanced_tutorials/timeseries/1_feature_backfill.ipynb index 5a92c6c5..69c273a2 100644 --- a/advanced_tutorials/timeseries/1_feature_backfill.ipynb +++ b/advanced_tutorials/timeseries/1_feature_backfill.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "3e80dec4", + "id": "323e6bee", "metadata": {}, "source": [ "## ๐Ÿ“ Imports " @@ -11,7 +11,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27d37094", + "id": "2690f222", "metadata": {}, "outputs": [], "source": [ @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "cfae4d6e", + "id": "99b76fbb", "metadata": {}, "source": [ "## โš™๏ธ Data Generation \n", @@ -39,33 +39,33 @@ { "cell_type": "code", "execution_count": null, - "id": "b7bee036", + "id": "5753f72e", "metadata": {}, "outputs": [], "source": [ - "# Define a constant START_DATE with a specific date (September 1, 2022)\n", - "START_DATE = datetime.date(2022, 9, 1)" + "# Define a constant START_DATE with a specific date (January 1, 2024)\n", + "START_DATE = datetime.date(2024, 1, 1)" ] }, { "cell_type": "code", "execution_count": null, - "id": "b610580b", + "id": "efbfce8f", "metadata": {}, "outputs": [], "source": [ "# Generate synthetic historical data using the generate_historical_data function from START_DATE till current date\n", - "data_generated = generate_historical_data(\n", - " START_DATE, # Start date for data generation (September 1, 2022)\n", + "prices_data_generated = generate_historical_data(\n", + " START_DATE, # Start date for data generation (January 1, 2024)\n", ")\n", "\n", "# Display the first 3 rows of the generated data\n", - "data_generated.head(3)" + "prices_data_generated.head(3)" ] }, { "cell_type": "markdown", - "id": "3e8bacfa", + "id": "ff860d36", "metadata": {}, "source": [ "Look at historical values for 1 and 2 IDs." 
@@ -74,16 +74,16 @@ { "cell_type": "code", "execution_count": null, - "id": "5e229269", + "id": "ea5bb4c9", "metadata": {}, "outputs": [], "source": [ - "plot_historical_id([1,2], data_generated)" + "plot_historical_id([1,2], prices_data_generated)" ] }, { "cell_type": "markdown", - "id": "c83e00ef", + "id": "f08be0a3", "metadata": {}, "source": [ "## ๐Ÿ‘ฎ๐Ÿปโ€โ™‚๏ธ Great Expectations " @@ -92,12 +92,12 @@ { "cell_type": "code", "execution_count": null, - "id": "4d644fc1", + "id": "b3968626", "metadata": {}, "outputs": [], "source": [ "# Convert the generated historical data DataFrame to a Great Expectations DataFrame\n", - "ge_price_df = ge.from_pandas(data_generated)\n", + "ge_price_df = ge.from_pandas(prices_data_generated)\n", "\n", "# Retrieve the expectation suite associated with the ge DataFrame\n", "expectation_suite_price = ge_price_df.get_expectation_suite()\n", @@ -109,7 +109,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21ea95d9", + "id": "c394a647", "metadata": {}, "outputs": [], "source": [ @@ -152,7 +152,47 @@ }, { "cell_type": "markdown", - "id": "6148b5e6", + "id": "102db6b7", + "metadata": {}, + "source": [ + "## โš™๏ธ Feature Engineering \n", + "\n", + "We will engineer the next features:\n", + "\n", + "- `ma_7`: This feature represents the 7-day moving average of the 'price' data, providing a smoothed representation of short-term price trends.\n", + "\n", + "- `ma_14`: This feature represents the 14-day moving average of the 'price' data, offering a slightly longer-term smoothed price trend.\n", + "\n", + "- `ma_30`: This feature represents the 30-day moving average of the 'price' data, providing a longer-term smoothed representation of price trends.\n", + "\n", + "- `daily_rate_of_change`: This feature calculates the daily rate of change in prices as a percentage change, indicating how much the price has changed from the previous day.\n", + "\n", + "- `volatility_30_day`: This feature measures the volatility of prices over a 30-day window using the standard deviation. Higher values indicate greater price fluctuations.\n", + "\n", + "- `ema_02`: This feature calculates the exponential moving average (EMA) of 'price' with a smoothing factor of 0.2, giving more weight to recent data points in the calculation.\n", + "\n", + "- `ema_05`: Similar to ema_02, this feature calculates the EMA of 'price' with a smoothing factor of 0.5, providing a different degree of responsiveness to recent data.\n", + "\n", + "- `rsi`: The Relative Strength Index (RSI) is a momentum oscillator that measures the speed and change of price movements. It ranges from 0 to 100, with values above 70 indicating overbought conditions and values below 30 indicating oversold conditions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "808d454d", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate second-order features\n", + "averages_df = calculate_second_order_features(prices_data_generated)\n", + "\n", + "# Display the first 3 rows of the resulting DataFrame\n", + "averages_df.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "b82a43e2", "metadata": {}, "source": [ "## ๐Ÿ”ฎ Connect to Hopsworks Feature Store " @@ -161,7 +201,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e8d98244", + "id": "d616f46f", "metadata": {}, "outputs": [], "source": [ @@ -174,22 +214,22 @@ }, { "cell_type": "markdown", - "id": "9471bceb", + "id": "d64c2b47", "metadata": {}, "source": [ - "## ๐Ÿช„ Feature Group Creation " + "## ๐Ÿช„ Prices Feature Group " ] }, { "cell_type": "code", "execution_count": null, - "id": "b13b302d", + "id": "82993193", "metadata": {}, "outputs": [], "source": [ "# Get or create the 'price' feature group\n", - "price_fg = fs.get_or_create_feature_group(\n", - " name='price',\n", + "prices_fg = fs.get_or_create_feature_group(\n", + " name='prices',\n", " description='Price Data',\n", " version=1,\n", " primary_key=['id'],\n", @@ -198,81 +238,178 @@ " expectation_suite=expectation_suite_price,\n", ") \n", "# Insert data\n", - "price_fg.insert(data_generated)" + "prices_fg.insert(prices_data_generated)\n", + "print('โœ… Done!')" ] }, { "cell_type": "markdown", - "id": "b7ad450b", + "id": "5f3d5ecd", "metadata": {}, "source": [ - "## โš™๏ธ Feature Engineering \n", - "\n", - "We will engineer the next features:\n", - "\n", - "- `ma_7`: This feature represents the 7-day moving average of the 'price' data, providing a smoothed representation of short-term price trends.\n", - "\n", - "- `ma_14`: This feature represents the 14-day moving average of the 'price' data, offering a slightly longer-term smoothed price trend.\n", - "\n", - "- `ma_30`: This feature represents the 30-day moving average of the 'price' data, providing a longer-term smoothed representation of price trends.\n", - "\n", - "- `daily_rate_of_change`: This feature calculates the daily rate of change in prices as a percentage change, indicating how much the price has changed from the previous day.\n", - "\n", - "- `volatility_30_day`: This feature measures the volatility of prices over a 30-day window using the standard deviation. Higher values indicate greater price fluctuations.\n", - "\n", - "- `ema_02`: This feature calculates the exponential moving average (EMA) of 'price' with a smoothing factor of 0.2, giving more weight to recent data points in the calculation.\n", - "\n", - "- `ema_05`: Similar to ema_02, this feature calculates the EMA of 'price' with a smoothing factor of 0.5, providing a different degree of responsiveness to recent data.\n", - "\n", - "- `rsi`: The Relative Strength Index (RSI) is a momentum oscillator that measures the speed and change of price movements. It ranges from 0 to 100, with values above 70 indicating overbought conditions and values below 30 indicating oversold conditions." 
+ "## ๐Ÿช„ Averages Feature Group " ] }, { "cell_type": "code", "execution_count": null, - "id": "2b3f9bbe", + "id": "921e2d25", "metadata": {}, "outputs": [], "source": [ - "# Calculate second-order features\n", - "averages_df = calculate_second_order_features(data_generated)\n", - "\n", - "# Display the first 3 rows of the resulting DataFrame\n", - "averages_df.head(3)" + "# Get or create the 'averages' feature group\n", + "averages_fg = fs.get_or_create_feature_group(\n", + " name='averages',\n", + " description='Calculated second order features',\n", + " version=1,\n", + " primary_key=['id'],\n", + " event_time='date',\n", + " online_enabled=True,\n", + " parents=[prices_fg],\n", + ")\n", + "# Insert data\n", + "averages_fg.insert(averages_df, wait=True)" ] }, { "cell_type": "markdown", - "id": "b8496432", + "id": "ef27a716", "metadata": {}, "source": [ - "## ๐Ÿช„ Feature Group Creation " + "## ๐Ÿ•ต๐Ÿปโ€โ™‚๏ธ Feature monitoring\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "e4c0dbab", + "id": "0c94afe5", "metadata": {}, "outputs": [], "source": [ - "# Get or create the 'averages' feature group\n", - "averages_fg = fs.get_or_create_feature_group(\n", - " name='averages',\n", - " description='Calculated second order features',\n", - " version=1,\n", - " primary_key=['id'],\n", - " event_time='date',\n", - " online_enabled=True,\n", - " parents=[price_fg],\n", - ")\n", - "# Insert data\n", - "averages_fg.insert(averages_df)" + "# Raw Price Monitoring\n", + "price_basic_monitoring = prices_fg.create_feature_monitoring(\n", + " name=\"price_basic_monitoring\",\n", + " feature_name=\"price\",\n", + " description=\"Monitor daily price statistics and detect sudden changes\",\n", + " cron_expression=\"0 0 0 * * ? *\" # Daily at midnight (second, minute, hour, day, month, day-of-week, year)\n", + ").with_detection_window(\n", + " time_offset=\"1d\",\n", + " row_percentage=1.0\n", + ").with_reference_window(\n", + " time_offset=\"2d\",\n", + " window_length=\"1d\",\n", + " row_percentage=1.0\n", + ").compare_on(\n", + " metric=\"mean\",\n", + " threshold=0.05\n", + ").save()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b84a3a3c", + "metadata": {}, + "outputs": [], + "source": [ + "# Moving Average Cross Monitoring\n", + "ma_cross_monitoring = averages_fg.create_feature_monitoring(\n", + " name=\"ma_cross_monitoring\",\n", + " description=\"Monitor crossovers between short and long-term moving averages\",\n", + " feature_name=\"ma_7\",\n", + " cron_expression=\"0 0 0 * * ? *\" # Daily at midnight\n", + ").with_detection_window(\n", + " time_offset=\"1d\",\n", + " row_percentage=1.0\n", + ").with_reference_window(\n", + " time_offset=\"31d\",\n", + " window_length=\"30d\",\n", + " row_percentage=1.0,\n", + ").compare_on(\n", + " metric=\"mean\",\n", + " threshold=0.02\n", + ").save()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8053c37", + "metadata": {}, + "outputs": [], + "source": [ + "# Volatility Monitoring\n", + "volatility_monitoring = averages_fg.create_feature_monitoring(\n", + " name=\"volatility_monitoring\",\n", + " feature_name=\"volatility_30_day\",\n", + " description=\"Monitor significant changes in 30-day volatility\",\n", + " cron_expression=\"0 0 0 * * ? 
*\" # Daily at midnight\n", + ").with_detection_window(\n", + " time_offset=\"1d\",\n", + " row_percentage=1.0\n", + ").with_reference_window(\n", + " time_offset=\"8d\",\n", + " window_length=\"7d\",\n", + " row_percentage=1.0\n", + ").compare_on(\n", + " metric=\"mean\",\n", + " threshold=0.15\n", + ").save()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d44fa963", + "metadata": {}, + "outputs": [], + "source": [ + "# RSI Extreme Monitoring\n", + "rsi_monitoring = averages_fg.create_feature_monitoring(\n", + " name=\"rsi_monitoring\",\n", + " feature_name=\"rsi\",\n", + " description=\"Monitor RSI for overbought/oversold conditions\",\n", + " cron_expression=\"0 0 0 * * ? *\" # Daily at midnight\n", + ").with_detection_window(\n", + " time_offset=\"1d\",\n", + " row_percentage=1.0\n", + ").with_reference_value(\n", + " value=50\n", + ").compare_on(\n", + " metric=\"mean\",\n", + " threshold=20\n", + ").save()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9974daa9", + "metadata": {}, + "outputs": [], + "source": [ + "# Rate of Change Monitoring using MEAN\n", + "roc_monitoring = averages_fg.create_feature_monitoring(\n", + " name=\"rate_of_change_monitoring\",\n", + " feature_name=\"daily_rate_of_change\",\n", + " description=\"Monitor for abnormal average price changes\",\n", + " cron_expression=\"0 0 0 * * ? *\" # Daily at midnight\n", + ").with_detection_window(\n", + " time_offset=\"1d\",\n", + " row_percentage=1.0\n", + ").with_reference_window(\n", + " time_offset=\"8d\",\n", + " window_length=\"7d\",\n", + " row_percentage=1.0\n", + ").compare_on(\n", + " metric=\"mean\", \n", + " threshold=1.0 # Threshold for mean daily rate of change (1% average change)\n", + ").save()" ] }, { "cell_type": "markdown", - "id": "6fdf0327", + "id": "5f5b2472", "metadata": {}, "source": [ "---" @@ -281,7 +418,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -295,7 +432,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/timeseries/1a_price_feature_pipeline.py b/advanced_tutorials/timeseries/1a_price_feature_pipeline.py index 7bc513eb..65d6725b 100644 --- a/advanced_tutorials/timeseries/1a_price_feature_pipeline.py +++ b/advanced_tutorials/timeseries/1a_price_feature_pipeline.py @@ -12,7 +12,7 @@ # Retrieve Price Feature Group price_fg = fs.get_feature_group( - name='price', + name='prices', version=1, ) # Insert generated data for today into Price Feature Group diff --git a/advanced_tutorials/timeseries/1b_averages_feature_pipeline.py b/advanced_tutorials/timeseries/1b_averages_feature_pipeline.py index cb35806a..331dc408 100644 --- a/advanced_tutorials/timeseries/1b_averages_feature_pipeline.py +++ b/advanced_tutorials/timeseries/1b_averages_feature_pipeline.py @@ -6,32 +6,33 @@ # Connect to the Feature Store project = hopsworks.login() -fs = project.get_feature_store() +fs = project.get_feature_store() -# Retrieve Averages Feature Group +# Retrieve Feature Groups averages_fg = fs.get_feature_group( - name='averages', + name='averages', version=1, ) -# Retrieve Price Feature Group price_fg = fs.get_feature_group( - name='price', + name='prices', version=1, ) -# Get today's date -today = datetime.today() -# Calculate the date 30 days ago -thirty_days_ago = (today - timedelta(days=31)).strftime("%Y-%m-%d") +# Get 
today's date and 30 days ago as timestamps +today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) +thirty_days_ago = today - timedelta(days=31) -# Read price data for 30 days ago +# Read price data using timestamp month_price_data = price_fg.filter(price_fg.date >= thirty_days_ago).read() # Calculate second order features averages_df = calculate_second_order_features(month_price_data) -# Get calculated second order features only for today -averages_today = averages_df[averages_df.date == today.strftime("%Y-%m-%d")] +# Convert today's date to string format for filtering +today_str = today.strftime("%Y-%m-%d") -# Insert second order features for today into Averages Feature Group +# Filter for today's data +averages_today = averages_df[averages_df['date'].dt.strftime("%Y-%m-%d") == today_str] + +# Insert second order features for today averages_fg.insert(averages_today) diff --git a/advanced_tutorials/timeseries/2_training_pipeline.ipynb b/advanced_tutorials/timeseries/2_training_pipeline.ipynb index daecc690..c2417c49 100644 --- a/advanced_tutorials/timeseries/2_training_pipeline.ipynb +++ b/advanced_tutorials/timeseries/2_training_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "83dd6e7e", + "id": "498c0916", "metadata": {}, "source": [ "## ๐Ÿ“ Imports " @@ -11,7 +11,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5c5b0a68", + "id": "5cfe0245", "metadata": {}, "outputs": [], "source": [ @@ -21,7 +21,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8355be0e", + "id": "bfc0071e", "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "markdown", - "id": "9dbea0ad", + "id": "95952e61", "metadata": {}, "source": [ "## ๐Ÿ”ฎ Connect to Hopsworks Feature Store " @@ -49,7 +49,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e499f528", + "id": "6f24908b", "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b9c6c942", + "id": "c94da3e0", "metadata": {}, "outputs": [], "source": [ @@ -73,15 +73,15 @@ " version=1,\n", ")\n", "\n", - "price_fg = fs.get_feature_group(\n", - " name='price',\n", + "prices_fg = fs.get_feature_group(\n", + " name='prices',\n", " version=1,\n", ")" ] }, { "cell_type": "markdown", - "id": "7e72db48", + "id": "4ec81684", "metadata": {}, "source": [ "## ๐Ÿ”ช Feature Selection " @@ -90,13 +90,13 @@ { "cell_type": "code", "execution_count": null, - "id": "0740ac7f", + "id": "acdab503", "metadata": {}, "outputs": [], "source": [ "# Select features for training data\n", - "selected_features = price_fg.select_all() \\\n", - " .join(averages_fg.select_except(['date']))\n", + "selected_features = prices_fg.select_all() \\\n", + " .join(averages_fg.select_features())\n", "\n", "# Uncomment this if you would like to view your selected features\n", "# selected_features.show(5)" @@ -104,7 +104,7 @@ }, { "cell_type": "markdown", - "id": "1334a4cb", + "id": "8f229b63", "metadata": {}, "source": [ "## ๐Ÿค– Transformation Functions " @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bccb7d00", + "id": "8f17a7fd", "metadata": {}, "outputs": [], "source": [ @@ -126,16 +126,16 @@ "]\n", "\n", "# Map features to transformations\n", - "transformation_functions = {\n", - " feature_name: min_max_scaler\n", + "transformation_functions = [\n", + " min_max_scaler(feature_name)\n", " for feature_name in feature_names\n", - "}\n", + "]\n", "transformation_functions" ] }, { "cell_type": "markdown", - 
"id": "925ad679", + "id": "4f7a0ddc", "metadata": {}, "source": [ "## โš™๏ธ Feature View Creation " @@ -144,7 +144,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ffdfda5e", + "id": "5530d33d", "metadata": {}, "outputs": [], "source": [ @@ -160,7 +160,7 @@ }, { "cell_type": "markdown", - "id": "3e1e1170", + "id": "9b7b990d", "metadata": {}, "source": [ "## ๐Ÿ‹๏ธ Training Dataset Creation " @@ -169,16 +169,16 @@ { "cell_type": "code", "execution_count": null, - "id": "554cbace", + "id": "b29122da", "metadata": {}, "outputs": [], "source": [ "# Get training and testing sets\n", "X_train, X_test, y_train, y_test = feature_view.train_test_split(\n", - " description='Prices Dataset', # Provide a description for the dataset split\n", - " train_start='2022-09-01', # Start date for the training set\n", - " train_end='2023-07-01', # End date for the training set\n", - " test_start='2023-07-01', # Start date for the testing set\n", + " description='Prices Dataset', # Provide a description for the dataset split\n", + " train_start='2024-01-01', # Start date for the training set\n", + " train_end='2024-08-31', # End date for the training set\n", + " test_start='2024-09-01', # Start date for the testing set\n", " test_end=datetime.today().strftime(\"%Y-%m-%d\"), # End date for the testing set (current date)\n", ")" ] @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "041666ce", + "id": "37f8dee1", "metadata": {}, "outputs": [], "source": [ @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d7315f14", + "id": "c5ccc97b", "metadata": {}, "outputs": [], "source": [ @@ -206,7 +206,7 @@ { "cell_type": "code", "execution_count": null, - "id": "af817399", + "id": "623b86dd", "metadata": {}, "outputs": [], "source": [ @@ -229,7 +229,7 @@ }, { "cell_type": "markdown", - "id": "4e7d0253", + "id": "c5816d56", "metadata": {}, "source": [ "## ๐Ÿงฌ Modeling \n", @@ -240,7 +240,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a999d119", + "id": "08cd06f5", "metadata": {}, "outputs": [], "source": [ @@ -255,13 +255,13 @@ "\n", "# Calculate RMSE on the validation set\n", "mse = mean_squared_error(y_test, y_test_pred, squared=False)\n", - "print(f\"Mean Squared Error (MSE): {mse}\")" + "print(f\"๐ŸŽฏ Mean Squared Error (MSE): {mse}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "3b0f556d", + "id": "66d5920b", "metadata": {}, "outputs": [], "source": [ @@ -290,7 +290,7 @@ }, { "cell_type": "markdown", - "id": "558799fa", + "id": "57c88024", "metadata": {}, "source": [ "## โš™๏ธ Model Schema " @@ -299,7 +299,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3b490e4d", + "id": "9e1c5410", "metadata": {}, "outputs": [], "source": [ @@ -324,7 +324,7 @@ }, { "cell_type": "markdown", - "id": "44e8f511", + "id": "ed7546cf", "metadata": {}, "source": [ "## ๐Ÿ“ Register model " @@ -333,7 +333,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fb397a64", + "id": "a2ff2f0a", "metadata": {}, "outputs": [], "source": [ @@ -347,14 +347,19 @@ "# Save the trained XGBoost model using joblib\n", "joblib.dump(model, f'{model_dir}/xgboost_price_model.pkl')\n", "\n", + "# Check if the images directory exists, and create it if not\n", + "images_dir = f\"{model_dir}/images\"\n", + "if not os.path.isdir(images_dir):\n", + " os.mkdir(images_dir)\n", + "\n", "# Write the generated Plotly figure image to the specified directory\n", - "fig.write_image(f'{model_dir}/model_prediction.png')" + 
"fig.write_image(f'{images_dir}/model_prediction.png')" ] }, { "cell_type": "code", "execution_count": null, - "id": "782f9731", + "id": "dfe0b9c7", "metadata": {}, "outputs": [], "source": [ @@ -368,6 +373,7 @@ " model_schema=model_schema, # Provide the model schema\n", " input_example=X_train.sample(), # Provide an example of the input data\n", " description=\"Price Predictor\", # Add a description for the model\n", + " feature_view=feature_view, # Add a feature view to the model\n", ")\n", "\n", "# Save the model to the specified directory\n", @@ -376,7 +382,7 @@ }, { "cell_type": "markdown", - "id": "88bfb99f", + "id": "bdb89574", "metadata": {}, "source": [ "## ๐Ÿš€ Model Deployment\n", @@ -390,7 +396,7 @@ }, { "cell_type": "markdown", - "id": "4017c8e4", + "id": "d3fadeb6", "metadata": {}, "source": [ "## ๐Ÿ“Ž Predictor script for Python models\n", @@ -405,7 +411,7 @@ { "cell_type": "code", "execution_count": null, - "id": "79a97ca9", + "id": "74090322", "metadata": {}, "outputs": [], "source": [ @@ -413,7 +419,7 @@ "import os\n", "import numpy as np\n", "import pandas as pd\n", - "import hsfs\n", + "import hopsworks\n", "import joblib\n", "\n", "\n", @@ -421,33 +427,34 @@ "\n", " def __init__(self):\n", " \"\"\" Initializes the serving state, reads a trained model\"\"\" \n", - " # get feature store handle\n", - " fs_conn = hsfs.connection()\n", - " self.fs = fs_conn.get_feature_store()\n", + " # Get feature store handle\n", + " project = hopsworks.login()\n", + " mr = project.get_model_registry()\n", " \n", - " # get feature view\n", - " self.fv = self.fs.get_feature_view(\"price_fv\", 1)\n", - " \n", - " # initialize serving\n", - " self.fv.init_serving(1)\n", + " # Retrieve the feature view from the model\n", + " retrieved_model = mr.get_model(\n", + " name=\"xgboost_price_model\",\n", + " version=1,\n", + " )\n", + " self.feature_view = retrieved_model.get_feature_view()\n", "\n", - " # load the trained model\n", + " # Load the trained model\n", " self.model = joblib.load(os.environ[\"ARTIFACT_FILES_PATH\"] + \"/xgboost_price_model.pkl\")\n", - " print(\"Initialization Complete\")\n", + " print(\"โœ… Initialization Complete\")\n", "\n", " \n", " def predict(self, id_value):\n", " \"\"\" Serves a prediction request usign a trained model\"\"\"\n", " # Retrieve feature vectors\n", - " feature_vector = self.fv.get_feature_vector(\n", - " entry = {'id': id_value[0]}\n", + " feature_vector = self.feature_view.get_feature_vector(\n", + " entry = {'id': id_value[0][0]}\n", " )\n", " return self.model.predict(np.asarray(feature_vector[1:]).reshape(1, -1)).tolist()" ] }, { "cell_type": "markdown", - "id": "b979582c", + "id": "f21d7e08", "metadata": {}, "source": [ "This script needs to be put into a known location in the Hopsworks file system. Let's call the file predict_example.py and put it in the Models directory." 
@@ -456,7 +463,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a64048e9", + "id": "a2ba0131", "metadata": {}, "outputs": [], "source": [ @@ -472,7 +479,7 @@ }, { "cell_type": "markdown", - "id": "7d7cea10", + "id": "bf50feb9", "metadata": {}, "source": [ "---" @@ -480,7 +487,7 @@ }, { "cell_type": "markdown", - "id": "cf59ec95", + "id": "71b4628b", "metadata": {}, "source": [ "## ๐Ÿš€ Create the deployment\n", @@ -491,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9f98e2dd", + "id": "e488c0d8", "metadata": {}, "outputs": [], "source": [ @@ -505,7 +512,7 @@ { "cell_type": "code", "execution_count": null, - "id": "86154683", + "id": "e0ec51ee", "metadata": {}, "outputs": [], "source": [ @@ -516,7 +523,7 @@ { "cell_type": "code", "execution_count": null, - "id": "80e5b6ec", + "id": "a97b5a4a", "metadata": {}, "outputs": [], "source": [ @@ -527,17 +534,17 @@ { "cell_type": "code", "execution_count": null, - "id": "ea906d47", + "id": "cf1d59a2", "metadata": {}, "outputs": [], "source": [ "# Predict price for the 1 ID\n", - "deployment.predict({'instances': [1]})" + "deployment.predict(inputs=[[1]])" ] }, { "cell_type": "markdown", - "id": "1354c27a", + "id": "305557e3", "metadata": {}, "source": [ "---" @@ -546,7 +553,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -560,7 +567,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/advanced_tutorials/timeseries/3_batch_inference_pipeline.py b/advanced_tutorials/timeseries/3_batch_inference_pipeline.py index e6691d01..36251efe 100644 --- a/advanced_tutorials/timeseries/3_batch_inference_pipeline.py +++ b/advanced_tutorials/timeseries/3_batch_inference_pipeline.py @@ -22,7 +22,7 @@ def get_feature_store(): def get_feature_group(): st.write("๐Ÿช Retrieving the Price Feature Group...") price_fg = fs.get_feature_group( - name='price', + name='prices', version=1, ) st.write("โœ… Success!") @@ -33,7 +33,7 @@ def get_feature_group(): def get_feature_view(): st.write("๐Ÿช Retrieving the Feature View...") feature_view = fs.get_feature_view( - name = 'price_fv3', + name = 'price_fv', version = 1 ) st.write("โœ… Success!") @@ -66,11 +66,11 @@ def retrieve_model(): st.write("โš™๏ธ Retrieving Model from Model Registry...") mr = project.get_model_registry() retrieved_model = mr.get_model( - name="xgboost_price_model2", + name="xgboost_price_model", version=1, ) saved_model_dir = retrieved_model.download() - model = joblib.load(saved_model_dir + "/xgboost_price_model2.pkl") + model = joblib.load(saved_model_dir + "/xgboost_price_model.pkl") st.write("โœ… Success!") diff --git a/advanced_tutorials/timeseries/3_online_inference_pipeline.py b/advanced_tutorials/timeseries/3_online_inference_pipeline.py index da1c3ea9..d9ac87be 100644 --- a/advanced_tutorials/timeseries/3_online_inference_pipeline.py +++ b/advanced_tutorials/timeseries/3_online_inference_pipeline.py @@ -14,7 +14,7 @@ def get_deployment(): ms = project.get_model_serving() # Get deployment - deployment = ms.get_deployment("priceonlinemodeldeployment3") + deployment = ms.get_deployment("priceonlinemodeldeployment") # Start deployment deployment.start(await_running=180) @@ -33,7 +33,7 @@ def get_deployment(): st.write('You selected the next ID:', options) -preds = [deployment.predict({'instances': [option]}) for option in options] +preds = 
[deployment.predict({'instances': [[option]]}) for option in options] for option, pred in zip(options, preds): st.write(f'๐Ÿ”ฎ Predicted Price for the {option} ID: {round(pred["predictions"][0],2)}๐Ÿ’ฐ') \ No newline at end of file diff --git a/advanced_tutorials/timeseries/features/averages.py b/advanced_tutorials/timeseries/features/averages.py index c349b156..f063865e 100644 --- a/advanced_tutorials/timeseries/features/averages.py +++ b/advanced_tutorials/timeseries/features/averages.py @@ -11,13 +11,12 @@ def calculate_second_order_features(df: pd.DataFrame) -> pd.DataFrame: Returns: - pd.DataFrame: DataFrame with second-order features added for each unique ID. """ - # Convert the 'date' column to a datetime object + # Ensure date column is datetime df['date'] = pd.to_datetime(df['date']) - # Sort the DataFrame by 'date' + # Sort the DataFrame by 'id' and 'date' df = df.sort_values(by=['id', 'date']) - # Create a function to calculate features for each group def calculate_features(group: pd.DataFrame) -> pd.DataFrame: # Calculate moving averages for 7 days, 14 days, and 30 days group['ma_7'] = group['price'].rolling(window=7).mean() @@ -25,7 +24,7 @@ def calculate_features(group: pd.DataFrame) -> pd.DataFrame: group['ma_30'] = group['price'].rolling(window=30).mean() # Calculate the daily rate of change in prices - group['daily_rate_of_change'] = group['price'].pct_change() * 100 # Calculate as a percentage change + group['daily_rate_of_change'] = group['price'].pct_change() * 100 # Calculate the volatility using standard deviation for a 30-day window group['volatility_30_day'] = group['price'].rolling(window=30).std() @@ -43,9 +42,12 @@ def calculate_features(group: pd.DataFrame) -> pd.DataFrame: return group # Apply the calculate_features function to each ID group - df = df.groupby('id').apply(calculate_features) + df = df.groupby('id', group_keys=False).apply(calculate_features) # Drop the original 'price' column df.drop('price', axis=1, inplace=True) + # Ensure date is in timestamp format + df['date'] = pd.to_datetime(df['date']) + return df diff --git a/advanced_tutorials/timeseries/features/price.py b/advanced_tutorials/timeseries/features/price.py index ad372ef8..9cd2df8c 100644 --- a/advanced_tutorials/timeseries/features/price.py +++ b/advanced_tutorials/timeseries/features/price.py @@ -8,7 +8,7 @@ import plotly.colors as pc from typing import List, Union, Optional, Tuple, Dict -def generate_historical_day(date: date, start_date: date, data_list: List[Tuple[date, int, float]]) -> List[Tuple[date, int, float]]: +def generate_historical_day(date: date, start_date: date, data_list: List[Tuple[datetime.datetime, int, float]]) -> List[Tuple[datetime.datetime, int, float]]: """ Generates synthetic data for a given day with different price patterns for each ID. 
@@ -35,13 +35,16 @@ def generate_historical_day(date: date, start_date: date, data_list: List[Tuple[ # Generate a range of prices based on the calculated variations prices = np.linspace(price_base - price_variation, price_base + price_variation, num_entries) + # Convert date to datetime at midnight for timestamp compatibility + datetime_val = datetime.datetime.combine(date, datetime.time()) + for _ in range(num_entries): # Randomly select an ID from the list of IDs selected_id = np.random.choice(ids) # Ensure non-negative prices price = max(prices[_], 0) # Append the generated data entry to the data list - data_list.append((date, selected_id, round(price, 1))) + data_list.append((datetime_val, selected_id, round(price, 1))) return data_list @@ -71,6 +74,9 @@ def generate_historical_data(start_date: Optional[date] = None, end_date: Option df = pd.DataFrame(data_list, columns=['date', 'id', 'price']) + # Ensure date column is datetime type for timestamp compatibility + df['date'] = pd.to_datetime(df['date']) + df.drop_duplicates(inplace=True) return df @@ -87,7 +93,9 @@ def generate_today() -> pd.DataFrame: num_entries = 5000 # 5000 rows per day ids = np.arange(5001) # IDs from 0 to 5000 data_list = [] - date = datetime.date.today() + + # Use datetime instead of date for timestamp compatibility + current_datetime = datetime.datetime.combine(datetime.date.today(), datetime.time()) prices = ( 200 + np.random.uniform(-50, 50, num_entries) @@ -95,10 +103,13 @@ def generate_today() -> pd.DataFrame: for entry in range(num_entries): selected_id = np.random.choice(ids) - data_list.append((date, selected_id, round(prices[entry], 1))) + data_list.append((current_datetime, selected_id, round(prices[entry], 1))) df = pd.DataFrame(data_list, columns=['date', 'id', 'price']) + # Ensure date column is datetime type for timestamp compatibility + df['date'] = pd.to_datetime(df['date']) + df.drop_duplicates(inplace=True) return df @@ -115,7 +126,7 @@ def to_wide_format(data: pd.DataFrame) -> pd.DataFrame: - pd.DataFrame: A DataFrame in wide format with 'date' as the index, 'id' as columns, and 'price' values. """ # Convert the 'date' column to datetime type - data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d') + data['date'] = pd.to_datetime(data['date']) # Aggregate duplicate entries by taking the mean of prices agg_df = data.groupby(['date', 'id'])['price'].mean().reset_index() diff --git a/advanced_tutorials/transformation_functions/custom/0_creating_custom_transformation_functions.ipynb b/advanced_tutorials/transformation_functions/custom/0_creating_custom_transformation_functions.ipynb new file mode 100644 index 00000000..2d03e19d --- /dev/null +++ b/advanced_tutorials/transformation_functions/custom/0_creating_custom_transformation_functions.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d6fb35cf", + "metadata": {}, + "source": [ + "# ๐Ÿ‘จ๐Ÿปโ€๐Ÿซ Custom Transformation Functions\n", + "\n", + "In this tutorial you will learn how to create custom transformation functions in hopsworks feature store." 
+ ] + }, + { + "cell_type": "markdown", + "id": "a19fd127", + "metadata": {}, + "source": [ + "## ๐Ÿ—„๏ธ Table of Contents\n", + "- [๐Ÿ“ Imports](#1)\n", + "- [๐Ÿ”ฎ Connecting to Hopsworks Feature Store](#2)\n", + "- [๐Ÿ‘ฉ๐Ÿปโ€๐Ÿ”ฌ Creation of Custom Transformation Functions](#3)\n", + "- [โœ”๏ธ Testing Custom Transformation Functions in Hopsworks](#4)- \n", + "- [โœ๐Ÿป Registering Custom Transformation Functions in Hopsworks](#4)" + ] + }, + { + "cell_type": "markdown", + "id": "3cc6a7e9", + "metadata": {}, + "source": [ + "\n", + "# ๐Ÿ“ Imports " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22c08e9f", + "metadata": {}, + "outputs": [], + "source": [ + "# Importing necessary libraries\n", + "import pandas as pd # For data manipulation and analysis using DataFrames\n", + "import numpy as np # For numerical computations and arrays\n", + "import os # For operating system-related functions\n", + "import joblib # For saving and loading model files\n", + "\n", + "import xgboost as xgb # For using the XGBoost machine learning library\n", + "from sklearn.metrics import accuracy_score # For evaluating model accuracy" + ] + }, + { + "cell_type": "markdown", + "id": "b2f4b822", + "metadata": {}, + "source": [ + "\n", + "# ๐Ÿ”ฎ Connecting to Hopsworks Feature Store \n", + "\n", + "The next step is to login to the Hopsworks platform. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49800275", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store() " + ] + }, + { + "cell_type": "markdown", + "id": "63b3bab5-e4ba-4c75-bf63-8f72137bd33e", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# ๐Ÿ‘ฉ๐Ÿปโ€๐Ÿ”ฌ Creation of Custom Transformation Functions\n", + "\n", + "In Hopsworks, custom transformation functions can be defined using the `@hopsworks.udf` decorator. These transformation functions are implemented as Pandas UDFs, allowing efficient processing of large datasets. Hopsworks provides support for various types of transformations. Hopsworks also allows you to access training dataset statistics for any of the feature provided as input to the UDF. For more details, you can refer to the official documentation [here](https://docs.hopsworks.ai/latest/user_guides/fs/transformation_functions/).\n", + "\n", + "Below are two examples of User-Defined Functions (UDFs): add_one and scaler.\n", + "\n", + "The add_one function is a basic transformation that takes a feature as input and increments its value by one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba0042ec-cca5-422d-8aea-120422b1fb57", + "metadata": {}, + "outputs": [], + "source": [ + "@hopsworks.udf(return_type=int, drop=[\"feature\"])\n", + "def add_one(feature: pd.Series) -> pd.Series:\n", + " return feature + 1" + ] + }, + { + "cell_type": "markdown", + "id": "1d435446-2850-44b9-b226-40a5c35b63f4", + "metadata": {}, + "source": [ + "The `scaler` function takes a feature as input, along with its associated statistics, and scales the values to a range between 0 and 1. It then returns the transformed feature." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc7d9620-40e0-4826-aae2-4b50412e4491", + "metadata": {}, + "outputs": [], + "source": [ + "from hopsworks.hsfs.transformation_statistics import TransformationStatistics\n", + "\n", + "@hopsworks.udf(return_type=float, drop=[\"feature\"])\n", + "def scaler(feature: pd.Series, statistics=TransformationStatistics(\"feature\")) -> pd.Series:\n", + " return (feature - statistics.feature.min) / (statistics.feature.max - statistics.feature.min)" + ] + }, + { + "cell_type": "markdown", + "id": "c235d7bb-5f27-4a85-8bd4-19a4b51bb941", + "metadata": {}, + "source": [ + "\n", + "## โœ”๏ธ Testing of Custom Transformation Functions" + ] + }, + { + "cell_type": "markdown", + "id": "dbc8559e-953d-4d4c-9ba1-a9ad76489635", + "metadata": {}, + "source": [ + "Once a UDF is defined, it should be thoroughly tested to ensure it works as intended.\n", + "\n", + "In Hopsworks, to test a UDF, its `output_column_names` property must be set. Afterward, the executable function can be retrieved using the `get_udf` method.\n", + "\n", + "The `output_column_names` attribute needs to be manually set, as it is typically generated when the UDF is attached to a feature group or feature view. Once this is configured, the UDF can be tested by retrieving the executable function with `get_udf` and calling it using a Pandas Series as input." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f0f9c5a-7a11-4adb-90d8-745bd6452855", + "metadata": {}, + "outputs": [], + "source": [ + "# Assign output column names \n", + "add_one.output_column_names = [\"scaler_feature\"]\n", + "\n", + "# Get the excutable UDF based on the transformation statistics\n", + "udf = add_one.get_udf()\n", + "\n", + "# Create testing Series\n", + "feature = pd.Series([0, 5, 10])\n", + "\n", + "print(\"โ›ณ๏ธ The incremented are:\", udf(feature).values.tolist())" + ] + }, + { + "cell_type": "markdown", + "id": "11b85d73-c8c9-435c-ae83-f8b168105537", + "metadata": {}, + "source": [ + "The `scaler` UDF relies on the statistics of the training dataset. Therefore, to test it, the transformation_statistics attribute must be set using an instance of the `FeatureDescriptiveStatistics` object, which contains the necessary test values for the statistics." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a3ba859-74f0-46aa-8fcc-72b404e28070", + "metadata": {}, + "outputs": [], + "source": [ + "from hopsworks.hsfs.statistics import FeatureDescriptiveStatistics\n", + "\n", + "# Assign test statistics since the UDF uses statistics\n", + "statistics = [FeatureDescriptiveStatistics(feature_name=\"feature\", min=0, max=10)]\n", + "scaler.transformation_statistics = statistics\n", + "\n", + "# Assign output column names \n", + "scaler.output_column_names = [\"scaler_feature\"]\n", + "\n", + "# Get the excutable UDF based on the transformation statistics\n", + "udf = scaler.get_udf()\n", + "\n", + "# Get testing Series\n", + "feature = pd.Series([0, 5, 10])\n", + "\n", + "print(\"โ›ณ๏ธ The Scaled Values are:\", udf(feature).values.tolist())" + ] + }, + { + "cell_type": "markdown", + "id": "4d97d0ce-0903-4bab-857d-0231501a0f93", + "metadata": {}, + "source": [ + "Once a custom transformation function or UDF is defined, it can be used as an [On-Demand transformations](https://docs.hopsworks.ai/latest/user_guides/fs/feature_group/on_demand_transformations/) by attaching the function to a Feature Group, or as a [Model-Dependent transformations](https://docs.hopsworks.ai/latest/user_guides/fs/feature_view/model-dependent-transformations/) by linking it to a Feature View.\n", + "\n", + "Additionally, UDFs can be saved in the Hopsworks Feature Store, allowing them to be retrieved and reused in the future." + ] + }, + { + "cell_type": "markdown", + "id": "1399515f-a0b5-4ed5-b6cd-e912e23680e4", + "metadata": {}, + "source": [ + "\n", + "## โœ๐Ÿป Saving Custom Transformation Functions in Hopsworks\n", + "\n", + "Transformation functions can be saved in Hopsworks, allowing them to be retrieved and used later.\n", + "\n", + "To create a transformation function, use the .create_transformation_function() method with the following parameters:\n", + "\n", + "- `transformation_function`: Your custom transformation function/UDF.\n", + "- `version`: The version of your custom transformation function.\n", + "\n", + "Donโ€™t forget to use the .save() method to persist the transformation function in the backend.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9999454-ff67-4690-a421-e099923bd9b9", + "metadata": {}, + "outputs": [], + "source": [ + "scaler = fs.create_transformation_function(\n", + " scaler, \n", + " version=1,\n", + " )\n", + "scaler.save()" + ] + }, + { + "cell_type": "markdown", + "id": "47ba7ce6-0c0a-4e84-9a38-2fde3f4b0826", + "metadata": {}, + "source": [ + "Now let's check if the custom transformation functions is present in the feature store. You can be the function `get_transformation_functions` for this." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a8f70ea-97c9-417b-94ec-4cb7f23ce0ad", + "metadata": {}, + "outputs": [], + "source": [ + "# Check if your transformation functions are present in the feature store\n", + "[tf for tf in fs.get_transformation_functions()]" + ] + }, + { + "cell_type": "markdown", + "id": "beeeddad-b3b5-45c3-acbe-b2895de1fcf2", + "metadata": {}, + "source": [ + "A transformation function saved in Hopsworks can be retrieved using the `get_transformation_function` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96511d72-eee5-40db-a3ee-104653669a0f", + "metadata": {}, + "outputs": [], + "source": [ + "scaler = fs.get_transformation_function(name=\"scaler\", version=1)\n", + "scaler" + ] + }, + { + "cell_type": "markdown", + "id": "0c202c74", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "bb85ba21-51de-428b-93a3-e05cfe15297d", + "metadata": {}, + "source": [ + "## โญ๏ธ **Next:** Part 01 Feature Pipeline \n", + "\n", + "In the following notebook you will create feature groups and use on-demand transformation functions to create on-demand features." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/transformation_functions/custom/1_feature_pipeline.ipynb b/advanced_tutorials/transformation_functions/custom/1_feature_pipeline.ipynb new file mode 100644 index 00000000..7e3b536e --- /dev/null +++ b/advanced_tutorials/transformation_functions/custom/1_feature_pipeline.ipynb @@ -0,0 +1,516 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c997f05a", + "metadata": { + "tags": [] + }, + "source": [ + "# **Hopsworks Feature Store** - Part 01: Feature Pipeline\n", + "\n", + "**Note**: This tutorial does not support Google Colab.\n", + "\n", + "This is the first part of the quick start series of tutorials about Hopsworks Feature Store. As part of this first module, you will work with data related to credit card transactions. \n", + "The objective of this tutorial is to demonstrate how to work with **on-demand transformation functions** in the **Hopsworks Feature Store** for online data with a goal of training and deploying a model that can predict fraudulent transactions.\n", + "\n", + "\n", + "## ๐Ÿ—’๏ธ This notebook is divided into 3 sections:\n", + "1. Loading the data and feature engineering.\n", + "2. Create on-demand transformation functions.\n", + "3. Create feature groups with on-demand transformations and upload them to the Feature Store.\n", + "\n", + "![tutorial-flow](../../../images/01_featuregroups.png)\n", + "\n", + "First of all you will load the data and do some feature engineering on it."
+ ] + }, + { + "cell_type": "markdown", + "id": "3ebdad2e", + "metadata": {}, + "source": [ + "# ๐Ÿ“ Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1aa7ce8a", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -U hopsworks --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49806257", + "metadata": {}, + "outputs": [], + "source": [ + "from math import radians\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from features import transactions_fraud\n", + "\n", + "# Mute warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "id": "f87d8f95", + "metadata": {}, + "source": [ + "First of all you will load the data and do some feature engineering on it." + ] + }, + { + "cell_type": "markdown", + "id": "8f8cd4a4-b552-4cc8-b489-c4c0df165846", + "metadata": {}, + "source": [ + "# ๐Ÿ“ Feature Pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "66d04213", + "metadata": {}, + "source": [ + "## ๐Ÿ’ฝ Loading the Data \n", + "\n", + "The data you will use comes from 2 different CSV files:\n", + "\n", + "- `transactions.csv`: events containing information about when a credit card was used, such as a timestamp, location, and the amount spent. A boolean fraud_label variable (True/False) tells us whether a transaction was fraudulent or not.\n", + "- `profiles.csv`: credit card user information such as birthdate and city of residence.\n", + "\n", + "In a production system, these CSV files would originate from separate data sources or tables, and probably separate data pipelines. **These files have a common credit card number column cc_num, which you will use later to join features together from the different datasets.**\n", + "\n", + "Now, you can go ahead and load the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27f2b52e", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the profiles data from a CSV file\n", + "profiles_df = pd.read_csv(\n", + " \"https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_online/profiles.csv\", \n", + " parse_dates=[\"birthdate\"],\n", + ")\n", + "\n", + "# Rename columns for clarity\n", + "profiles_df.columns = [\"name\", \"gender\", \"mail\", \"birthdate\", \"City\", \"Country\", \"cc_num\"]\n", + "\n", + "# Display the first three rows of the DataFrame\n", + "profiles_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "713a9568", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the transactions data from a CSV file\n", + "trans_df = pd.read_csv(\n", + " \"https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_online/transactions.csv\", \n", + " parse_dates=[\"datetime\"],\n", + ")\n", + "\n", + "# Display the first three rows of the DataFrame\n", + "trans_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ad0edf3", + "metadata": {}, + "outputs": [], + "source": [ + "# Filter transactions DataFrame to include only rows with category \"Cash Withdrawal\"\n", + "trans_df = trans_df[trans_df.category == \"Cash Withdrawal\"].reset_index(level=0, drop=True)\n", + "\n", + "# Fill missing values in the 'country' column with \"US\"\n", + "trans_df[\"country\"] = trans_df[\"country\"].fillna(\"US\")\n", + "\n", + "# Add birthdate to trans_df for \n", + "trans_df = trans_df.merge(profiles_df, on=\"cc_num\")[['tid', 'datetime', 'cc_num', 'category', 'amount', 'latitude',\n", + " 'longitude', 'city', 'country', 'fraud_label', 'birthdate']]\n", + "\n", + "# Filter profiles DataFrame to include only rows with credit card numbers present in the filtered transactions DataFrame\n", + "profiles_df = profiles_df[profiles_df.cc_num.isin(trans_df.cc_num.unique())].reset_index(level=0, drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8efc0deb", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort the transactions DataFrame by 'datetime' and 'cc_num'\n", + "trans_df.sort_values([\"datetime\", \"cc_num\"], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "fe5105a1", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "42b88055", + "metadata": {}, + "source": [ + "## ๐Ÿ› ๏ธ Feature Engineering \n", + "\n", + "Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning you will create additional features based on these patterns. In particular, you will create two types of features:\n", + "\n", + "1. **Features that aggregate data from multiple time steps**. An example of this could be the transaction frequency of a credit card in the span of a few hours, which is computed using a window function.\n", + "2. **Features that aggregate data from different data sources**. This could for instance be the age of a customer at the time of a transaction, which combines the `birthdate` feature from `profiles.csv` with the `datetime` feature from `transactions.csv`.\n", + "\n", + "Let's start with the first category." 
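+ ] + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-9f00-4e6b-8f21-2d3c4b5a6e70", + "metadata": {}, + "source": [ + "To give a rough idea of what such a window feature could look like, here is a minimal, purely illustrative sketch (the actual features in this tutorial are computed by the `transactions_fraud` helper module used below, and the `trans_count_4h` name is hypothetical):\n", + "\n", + "```python\n", + "# Illustrative only: 4-hour rolling transaction count per credit card\n", + "counts_4h = (\n", + "    trans_df.sort_values('datetime')\n", + "    .set_index('datetime')\n", + "    .groupby('cc_num')['amount']\n", + "    .rolling('4h')\n", + "    .count()\n", + "    .rename('trans_count_4h')\n", + "    .reset_index()\n", + ")\n", + "```"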
+ ] + }, + { + "cell_type": "markdown", + "id": "99b27bbd", + "metadata": {}, + "source": [ + "Now you are ready to start by computing the distance between consecutive transactions, let's call it `loc_delta`.\n", + "Here you will use the [Haversine distance](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.haversine_distances.html?highlight=haversine#sklearn.metrics.pairwise.haversine_distances) to quantify the distance between two longitude and latitude coordinates." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f7d5009", + "metadata": {}, + "outputs": [], + "source": [ + "# Use the prepare_transactions_fraud function to process the trans_df DataFrame\n", + "trans_df = transactions_fraud.prepare_transactions_fraud(trans_df)\n", + "\n", + "# Display the first three rows of the modified DataFrame\n", + "trans_df.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "9a14693e-10b1-4b1b-b756-b99ed19b093e", + "metadata": {}, + "source": [ + "Next, we'll move on to the second category of features. Here, you'll calculate the `age_at_transaction` feature, which can be considered an on-demand feature." + ] + }, + { + "cell_type": "markdown", + "id": "38efde18-f5a6-411c-b53e-995fe0cb77b3", + "metadata": {}, + "source": [ + "### โšก๏ธ On-Demand Transformation Functions " + ] + }, + { + "cell_type": "markdown", + "id": "7018552b-1a34-4210-b330-7d29ade0efe0", + "metadata": {}, + "source": [ + "On-demand features are features that can only be computed at the time of an inference request, based on certain parameters available at that moment. You can learn more in the documentation available [here](https://docs.hopsworks.ai/latest/user_guides/fs/feature_group/on_demand_transformations/).\n", + "\n", + "To calculate the feature `age_at_transaction`, two parameters are needed: the transaction time and the date of birth of the person. The date of birth can be retrieved from an existing feature group, but the transaction time is only known when the inference request is made. As a result, the `age_at_transaction` feature is classified as an on-demand feature.\n", + "\n", + "Hopsworks enables the creation of on-demand features through on-demand transformation functions. On-demand transformation functions are created by attaching a transformation function to a feature group within Hopsworks.\n", + "\n", + "To create a transformation function, you need to use the `@hopsworks.udf` decorator. Let's start by importing the Hopsworks library." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8196ffe5-9ecb-4cdf-a7a7-3df8a9f18ca2", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks" + ] + }, + { + "cell_type": "markdown", + "id": "81bd28da-fec4-439c-b9a3-4fdbcee30cfa", + "metadata": {}, + "source": [ + "Now, let's create a transformation function that computes the on-demand feature `age_at_transaction`. Once the computation is complete, the `birthdate` column is dropped so that it is not included in the feature group, since it is already stored in another feature group."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71cdc812-08ef-4381-a535-d1baa4f72803", + "metadata": {}, + "outputs": [], + "source": [ + "@hopsworks.udf(return_type=float, drop=[\"birthdate\"])\n", + "def age_at_transaction(datetime, birthdate):\n", + " return (datetime - birthdate).dt.days / 365" + ] + }, + { + "cell_type": "markdown", + "id": "aad1b947-ffc6-489f-ab06-75bbd5d9deb5", + "metadata": {}, + "source": [ + "Now, let's test the transformation function we've defined. To do this, you'll first need to establish a connection to Hopsworks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2482f143-ecc0-48ef-b6e9-6b881a0a42b4", + "metadata": {}, + "outputs": [], + "source": [ + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "813c6415-a6f0-4e39-9344-7a664814830b", + "metadata": {}, + "outputs": [], + "source": [ + "# Assign the output column name before testing the UDF\n", + "age_at_transaction.output_column_names = [\"age_at_transaction\"]\n", + "\n", + "test_df = pd.DataFrame({\n", + " 'transaction_time': pd.to_datetime(['2022-01-01', '2022-01-15']),\n", + " 'date_of_birth': pd.to_datetime(['1998-03-21', '2000-01-30'])\n", + "})\n", + "\n", + "age_at_transaction.get_udf()(test_df['transaction_time'], test_df['date_of_birth'])" + ] + }, + { + "cell_type": "markdown", + "id": "74e826bb", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "ab3ac23b", + "metadata": {}, + "source": [ + "## ๐Ÿช„ Creating Feature Groups \n", + "\n", + "A [feature group](https://docs.hopsworks.ai/3.0/concepts/fs/feature_group/fg_overview/) can be seen as a collection of conceptually related features. In this case, you will create a feature group for the transaction data and a feature group for the windowed aggregations on the transaction data. Both will have `cc_num` as primary key, which will allow you to join them when creating a dataset in the next tutorial.\n", + "\n", + "Feature groups can also be used to define a namespace for features. For instance, in a real-life setting you would likely want to experiment with different window lengths. In that case, you can create feature groups with identical schema for each window length. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7af46c39", + "metadata": {}, + "outputs": [], + "source": [ + "fs.name" + ] + }, + { + "cell_type": "markdown", + "id": "15b742ad", + "metadata": {}, + "source": [ + "To create a feature group you need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group and a version number; if the version is not defined, it will automatically be set to `1`. \n", + "\n", + "To add the on-demand feature `age_at_transaction` to a feature group, you must create an on-demand transformation function by attaching the previously defined `age_at_transaction` transformation function to the feature group. The features to be passed to the transformation function can either be explicitly specified as parameters or, if not provided, the function will automatically use features from the feature group that match the names of the function's arguments, as sketched below."
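+ ] + }, + { + "cell_type": "markdown", + "id": "b7e8f9a0-1c2d-4e3f-9a8b-7c6d5e4f3a21", + "metadata": {}, + "source": [ + "For illustration only (the next cell uses the implicit form), the two ways of specifying this would look roughly as follows, mirroring how transformation functions are bound to column names elsewhere in these tutorials:\n", + "\n", + "```python\n", + "# Implicit: features are matched by the UDF's argument names ('datetime' and 'birthdate')\n", + "transformation_functions=[age_at_transaction]\n", + "\n", + "# Explicit: bind the UDF to specific feature names yourself\n", + "transformation_functions=[age_at_transaction('datetime', 'birthdate')]\n", + "```"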
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e926dc7", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'transactions_fraud_online_fg' feature group\n", + "trans_fg = fs.get_or_create_feature_group(\n", + " name=\"transactions_fraud_online_fg\",\n", + " version=1,\n", + " description=\"Transaction data\",\n", + " primary_key=['cc_num'],\n", + " event_time='datetime',\n", + " # Attaching the transformation function `age_at_transaction` to the feature group to create the on-demand feature `age_at_transaction`\n", + " transformation_functions=[age_at_transaction],\n", + " online_enabled=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a16ae49d", + "metadata": {}, + "source": [ + "Here you have also set `online_enabled=True`, which enables low latency access to the data. A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).\n", + "\n", + "At this point, you have only specified some metadata for the feature group. It does not store any data or even have a schema defined for the data. To make the feature group persistent you need to populate it with its associated data using the `insert` function." + ] + }, + { + "cell_type": "markdown", + "id": "9c6d1428-fd2d-4333-9ba6-ea6b395f2f55", + "metadata": {}, + "source": [ + "When inserting data into a feature group with an on-demand transformation function, you have to include all the features required for the transformation in the DataFrame being inserted. \n", + "\n", + "Hopsworks computes all on-demand features using the transformation function when data is inserted into the feature group, allowing for backfilling of on-demand features. This backfilling process reduces the computational effort required for creating training data, as these transformations do not need to be applied repeatedly."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a366430", + "metadata": {}, + "outputs": [], + "source": [ + "# Insert data into feature group\n", + "trans_fg.insert(trans_df)\n", + "print('โœ… Done!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d7de1db", + "metadata": {}, + "outputs": [], + "source": [ + "# Update feature descriptions\n", + "feature_descriptions = [\n", + " {\"name\": \"tid\", \"description\": \"Transaction id\"},\n", + " {\"name\": \"datetime\", \"description\": \"Transaction time\"},\n", + " {\"name\": \"cc_num\", \"description\": \"Number of the credit card performing the transaction\"},\n", + " {\"name\": \"amount\", \"description\": \"Dollar amount of the transaction\"},\n", + " {\"name\": \"country\", \"description\": \"Country in which the transaction was made\"},\n", + " {\"name\": \"fraud_label\", \"description\": \"Whether the transaction was fraudulent or not\"},\n", + " {\"name\": \"loc_delta_t_minus_1\", \"description\": \"Location of previous transaction\"},\n", + " {\"name\": \"time_delta_t_minus_1\", \"description\": \"Time of previous transaction\"},\n", + " {\"name\": \"age_at_transaction\", \"description\": \"Age of user at the time the transaction has been performed\"},\n", + "]\n", + "\n", + "for desc in feature_descriptions: \n", + " trans_fg.update_feature_description(desc[\"name\"], desc[\"description\"])" + ] + }, + { + "cell_type": "markdown", + "id": "18025f56-0ef8-4b2f-adf9-2e47b68e6efb", + "metadata": {}, + "source": [ + "You can now check the UI to see that the on-demand feature `age_at_transaction` is also present in the feature group along with the other features. On-demand features in the feature group can also be used as normal features when creating a feature view for model training and inference. You will see this in the following notebook.\n", + "\n", + "![tutorial-flow](images/on_demand_example.png)" + ] + }, + { + "cell_type": "markdown", + "id": "ffbe721c", + "metadata": {}, + "source": [ + "You can move on and do the same thing for the profile feature group." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8027f2d", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'profile_fraud_online_fg' feature group\n", + "profile_fg = fs.get_or_create_feature_group(\n", + " name=\"profile_fraud_online_fg\",\n", + " version=1,\n", + " description=\"Credit card holder demographic data\",\n", + " primary_key=['cc_num'],\n", + " online_enabled=True,\n", + ")\n", + "# Insert data into feature group\n", + "profile_fg.insert(profiles_df)\n", + "print('โœ… Done!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef348581", + "metadata": {}, + "outputs": [], + "source": [ + "# Update feature descriptions\n", + "feature_descriptions = [\n", + " {\"name\": \"cc_num\", \"description\": \"Number of the credit card performing the transaction\"},\n", + " {\"name\": \"gender\", \"description\": \"Gender of the credit card holder\"},\n", + "]\n", + "\n", + "for desc in feature_descriptions: \n", + " profile_fg.update_feature_description(desc[\"name\"], desc[\"description\"])" + ] + }, + { + "cell_type": "markdown", + "id": "c65cde95", + "metadata": {}, + "source": [ + "## โญ๏ธ **Next:** Part 02 Training Pipeline \n", + "\n", + "In the following notebook you will use our feature groups to create a dataset you can train a model on."
+ ] + } + ], + "metadata": { + "interpreter": { + "hash": "e1ddeae6eefc765c17da80d38ea59b893ab18c0c0904077a035ef84cfe367f83" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/transformation_functions/custom/2_training_pipeline.ipynb b/advanced_tutorials/transformation_functions/custom/2_training_pipeline.ipynb new file mode 100644 index 00000000..f5641c58 --- /dev/null +++ b/advanced_tutorials/transformation_functions/custom/2_training_pipeline.ipynb @@ -0,0 +1,746 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# **Hopsworks Feature Store** - Part 02: Training Pipeline\n", + "\n", + "\n", + "This notebook explains how to read from a feature group, create training dataset within the feature store, train a model and save it to model registry.\n", + "\n", + "## ๐Ÿ—’๏ธ This notebook is divided into the following sections:\n", + "\n", + "1. Fetch Feature Groups.\n", + "2. Define Model-Dependent Transformation functions.\n", + "3. Create Feature Views.\n", + "4. Create Training Dataset with training, validation and test splits.\n", + "5. Train the model.\n", + "6. Register model in Hopsworks Model Registry.\n", + "7. Create the Deployment.\n", + "\n", + "![part2](../../../images/02_training-dataset.png) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“ Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -U xgboost --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "import os\n", + "import time\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "from matplotlib import pyplot\n", + "import seaborn as sns\n", + "\n", + "import xgboost as xgb\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.metrics import f1_score\n", + "\n", + "# Mute warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ”ช Feature Selection \n", + "\n", + "You will start by selecting all the features you want to include for model training/inference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve feature groups.\n", + "trans_fg = fs.get_feature_group(\n", + " name='transactions_fraud_online_fg', \n", + " version=1,\n", + ")\n", + "profile_online_fg = fs.get_feature_group(\n", + " name='profile_fraud_online_fg', \n", + " version=1,\n", + ")\n", + "\n", + "# Select features for training dataset\n", + "selected_features = trans_fg.select_features().join(profile_online_fg.select([\"gender\", \"birthdate\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment this if you would like to view your selected features\n", + "#selected_features.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recall that you computed the features in `transactions_fraud_online_fg`. If you had created multiple feature groups with identical schema for different window lengths, and wanted to include them in the join you would need to include a prefix argument in the join to avoid feature name clashes. See the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/query_api/#join) for more details." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ๐Ÿค– Model-Dependent Transformation Functions \n", + "\n", + "Letโ€™s now apply some model-dependent transformations to our data. These transformations are specific to particular models and typically rely on the statistics of the training dataset. Hopsworks enables you to define custom transformation functions, like the `scaler` function we created earlier, which use training dataset statistics to perform model-specific transformations. Additionally, Hopsworks offers built-in transformation functions, such as `label_encoder`, `min_max_scaler`, `robust_scaler`, `standard_scaler`, and `one_hot_encoder`.\n", + "\n", + "You can explore more about model-dependent transformation functions [here](https://docs.hopsworks.ai/latest/user_guides/fs/feature_view/model-dependent-transformations/).\n", + "\n", + "By using Hopsworks to create model-dependent transformation functions, you can build **skew-free** AI pipelines. Hopsworks ensures the consistent application of the same model-dependent transformations and statistics when generating training data and when reading data for batch or online inference."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import built in transformation functions\n", + "from hopsworks.hsfs.builtin_transformations import label_encoder\n", + "\n", + "# Load the transformation functions saved in the feature store.\n", + "scaler = fs.get_transformation_function(name=\"scaler\")\n", + "\n", + "# Map features to transformation functions.\n", + "transformation_functions = [\n", + " label_encoder(\"country\"),\n", + " label_encoder(\"gender\"),\n", + " scaler(\"loc_delta_t_plus_1\"),\n", + " scaler(\"loc_delta_t_minus_1\"),\n", + " scaler(\"time_delta_t_minus_1\"),\n", + " scaler(\"age_at_transaction\"),\n", + " scaler(\"amount\"),\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## โš™๏ธ Feature View Creation \n", + "\n", + "The Feature Views allows schema in form of a query with filters, define a model target feature/label and additional transformation functions.\n", + "In order to create or get a Feature View you may use `fs.get_or_create_feature_view()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'transactions_fraud_online_fv' feature view\n", + "feature_view = fs.get_or_create_feature_view(\n", + " name='transactions_fraud_online_fv',\n", + " version=1,\n", + " query=selected_features,\n", + " labels=[\"fraud_label\"],\n", + " transformation_functions=transformation_functions,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The feature view should now be created and visible in the UI, along with all the transformation functions. You will notice that the on-demand feature `age_at_transaction` is displayed alongside the other features in the feature view, and it has the model-dependent transformation function `scaler` attached to it.\n", + "\n", + "![fg-overview](./images/fv_mdt_odt.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ‹๏ธ Training Dataset " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Training/Test splits, datasets creation. Using timerange arguments.\n", + "train_start = \"2022/01/01\"\n", + "train_end = \"2022/03/10\"\n", + "test_start = \"2022/03/10\"\n", + "test_end = \"2022/03/31\"\n", + "\n", + "X_train, X_test, y_train, y_test = feature_view.train_test_split(\n", + " train_start=train_start,\n", + " train_end=train_end,\n", + " test_start=test_start,\n", + " test_end=test_end,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the specified columns from the X_train\n", + "X_train.drop([\"tid\", \"birthdate\"], axis=1, inplace=True)\n", + "\n", + "# Drop the specified columns from the X_test\n", + "X_test.drop([\"tid\", \"birthdate\"], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Display the normalized value counts of the training labels (y_train)\n", + "y_train.value_counts(normalize=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the distribution is extremely skewed, which is natural considering that fraudulent transactions make up a tiny part of all transactions. Thus you should somehow address the class imbalance. 
There are many approaches for this, such as weighting the loss function, over- or undersampling, creating synthetic data, or modifying the decision threshold. In this example, you will use the simplest method which is to just supply a class weight parameter to our learning algorithm. The class weight will affect how much importance is attached to each class, which in our case means that higher importance will be placed on positive (fraudulent) samples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿงฌ Modeling\n", + "\n", + "Next you will train a model. Here, you set larger class weight for the positive class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Initialize an XGBoost classifier\n", + "model = xgb.XGBClassifier()\n", + "\n", + "# Train the classifier using the training features (X_train) and labels (y_train)\n", + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Predict the training set\n", + "y_pred_train = model.predict(X_train)\n", + "\n", + "# Predict the test set\n", + "y_pred_test = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_test.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute f1 score\n", + "metrics = {\n", + " \"f1_score\": f1_score(y_test, y_pred_test, average='macro')\n", + "}\n", + "metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate the confusion matrix for the test set predictions\n", + "results = confusion_matrix(\n", + " y_test, \n", + " y_pred_test, \n", + " labels=[False, True],\n", + ")\n", + "\n", + "# Print the confusion matrix\n", + "print(results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a DataFrame from the confusion matrix results with labeled rows and columns\n", + "df_cm = pd.DataFrame(\n", + " results, \n", + " ['True Normal', 'True Fraud'],\n", + " ['Pred Normal', 'Pred Fraud'],\n", + ")\n", + "\n", + "# Create a heatmap using seaborn with annotations\n", + "cm = sns.heatmap(df_cm, annot=True)\n", + "\n", + "# Get the figure from the heatmap\n", + "fig = cm.get_figure()\n", + "\n", + "# Display the figure\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### โš™๏ธ Model Schema\n", + "\n", + "The model needs to be set up with a [Model Schema](https://docs.hopsworks.ai/3.0/user_guides/mlops/registry/model_schema/), which describes the inputs and outputs for a model.\n", + "\n", + "A Model Schema can be automatically generated from training examples, as shown below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hsml.schema import Schema\n", + "from hsml.model_schema import ModelSchema\n", + "\n", + "# Create a Schema for the input features\n", + "input_schema = Schema(X_train)\n", + "\n", + "# Create a Schema for the output labels\n", + "output_schema = Schema(y_train)\n", + "\n", + "# Create a ModelSchema using the input and output schemas\n", + "model_schema = ModelSchema(\n", + " input_schema=input_schema, \n", + " output_schema=output_schema,\n", + ")\n", + "\n", + "# Convert the ModelSchema to a dictionary representation\n", + "model_schema.to_dict()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“ Register model\n", + "\n", + "One of the features in Hopsworks is the model registry. This is where we can store different versions of models and compare their performance. Models from the registry can then be served as API endpoints." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the directory name for the model registry\n", + "model_dir = \"fraud_online_model\"\n", + "\n", + "# Check if the directory exists, and create it if not\n", + "if not os.path.isdir(model_dir):\n", + " os.mkdir(model_dir)\n", + "\n", + "# Save the trained XGBoost model to a file within the model directory\n", + "joblib.dump(model, f\"{model_dir}/xgboost_fraud_online_model.pkl\")\n", + "\n", + "# Save the confusion matrix plot to an image file within the model directory\n", + "fig.savefig(f\"{model_dir}/confusion_matrix.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the model registry\n", + "mr = project.get_model_registry()\n", + "\n", + "# Create a Python model in the model registry\n", + "fraud_model = mr.python.create_model(\n", + " name=\"xgboost_fraud_online_model\", \n", + " metrics=metrics, # Specify the metrics used to evaluate the model\n", + " model_schema=model_schema, # Provide the model schema\n", + " input_example=[4467360740682089], # Example input for testing deployments\n", + " description=\"Fraud Online Predictor\",# Add a description for the model\n", + ")\n", + "\n", + "# Save the model to the specified model directory\n", + "fraud_model.save(model_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## ๐Ÿš€ Model Deployment\n", + "\n", + "\n", + "### About Model Serving\n", + "Models can be served via KFServing or \"default\" serving, which means a Docker container exposing a Flask server. For KFServing models, or models written in Tensorflow, you do not need to write a prediction file (see the section below). However, for sklearn models using default serving, you do need to proceed to write a prediction file.\n", + "\n", + "In order to use KFServing, you must have Kubernetes installed and enabled on your cluster." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ๐Ÿ“Ž Predictor script for Python models\n", + "\n", + "\n", + "Scikit-learn and XGBoost models are deployed as Python models, in which case you need to provide a **Predict** class that implements the **predict** method. The **predict()** method invokes the model on the inputs and returns the prediction as a list. 
\n", + "\n", + "Remember that the feature view contains the on-demand feature `age_at_transaction`, which requires the `datetime` parameter that denotes the transaction time for its computation. This transaction time is only known at the time of online inference. The **predict()** method extracts this request parameter and passes it as an argument to the **get_feature_vector** function to compute the on-demand feature `age_at_transaction`. Hopsworks ensures that the same on-demand transformation function is used to compute the feature during online inference, eliminating the potential for skew.\n", + "\n", + "The **init()** method is run when the predictor is loaded into memory, loading the model from the local directory it is materialized to, *ARTIFACT_FILES_PATH*.\n", + "\n", + "The directive \"%%writefile\" writes out the cell before to the given Python file. We will use the **predict_example.py** file to create a deployment for our model. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile predict_example.py\n", + "import os\n", + "import numpy as np\n", + "import hsfs\n", + "import joblib\n", + "from datetime import datetime\n", + "\n", + "\n", + "class Predict(object):\n", + "\n", + " def __init__(self):\n", + " \"\"\" Initializes the serving state, reads a trained model\"\"\" \n", + " # Get feature store handle\n", + " fs_conn = hsfs.connection()\n", + " self.fs = fs_conn.get_feature_store()\n", + " \n", + " # Get feature view\n", + " self.fv = self.fs.get_feature_view(\n", + " name=\"transactions_fraud_online_fv\", \n", + " version=1,\n", + " )\n", + " \n", + " # Initialize serving\n", + " self.fv.init_serving(1)\n", + "\n", + " # Load the trained model\n", + " self.model = joblib.load(os.environ[\"ARTIFACT_FILES_PATH\"] + \"/xgboost_fraud_online_model.pkl\")\n", + " print(\"Initialization Complete\")\n", + "\n", + " def predict(self, inputs):\n", + " \"\"\" Serves a prediction request usign a trained model\"\"\"\n", + " transaction_time = datetime.strptime(inputs[0][1], \"%m/%d/%Y, %H:%M:%S\")\n", + " feature_vector = self.fv.get_feature_vector({\"cc_num\": inputs[0][0]}, request_parameters={\"datetime\":transaction_time})\n", + " indexes_to_remove = [0,1]\n", + " feature_vector = [\n", + " i \n", + " for j, i \n", + " in enumerate(feature_vector) \n", + " if j not in indexes_to_remove\n", + " ] \n", + " return self.model.predict(np.asarray(feature_vector).reshape(1, -1)).tolist() # Numpy Arrays are not JSON serializable" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you wonder why we use the path Models/fraud_tutorial_model/1/model.pkl, it is useful to know that the Data Sets tab in the Hopsworks UI lets you browse among the different files in the project. Registered models will be found underneath the Models directory. Since you saved you model with the name fraud_tutorial_model, that's the directory you should look in. 1 is just the version of the model you want to deploy.\n", + "\n", + "This script needs to be put into a known location in the Hopsworks file system. Let's call the file predict_example.py and put it in the Models directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the dataset API for the current project\n", + "dataset_api = project.get_dataset_api()\n", + "\n", + "# Specify the local file path of the Python script to be uploaded\n", + "local_script_path = \"predict_example.py\"\n", + "\n", + "# Upload the Python script to the \"Models\", and overwrite if it already exists\n", + "uploaded_file_path = dataset_api.upload(local_script_path, \"Models\", overwrite=True)\n", + "\n", + "# Create the full path to the uploaded script for future reference\n", + "predictor_script_path = os.path.join(\"/Projects\", project.name, uploaded_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the deployment\n", + "Here, you fetch the model you want from the model registry and define a configuration for the deployment. For the configuration, you need to specify the serving type (default or KFserving)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Deploy the fraud model\n", + "deployment = fraud_model.deploy(\n", + " name=\"fraudonlinemodeldeployment\", # Specify a name for the deployment\n", + " script_file=predictor_script_path, # Provide the path to the Python script for prediction\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Print the name of the deployment\n", + "print(\"Deployment: \" + deployment.name)\n", + "\n", + "# Display information about the deployment\n", + "deployment.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Deployment is warming up...\")\n", + "time.sleep(45)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### The deployment has now been registered. 
However, to start it you need to run the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Start the deployment and wait for it to be in a running state for up to 300 seconds\n", + "deployment.start(await_running=300)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the current state of the deployment\n", + "deployment.get_state().describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# To troubleshoot you can use `get_logs()` method\n", + "deployment.get_logs(component='predictor')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stop Deployment\n", + "To stop the deployment you simply run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Stop the deployment and wait for it to be in a stopped state for up to 180 seconds\n", + "deployment.stop(await_stopped=180)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## โญ๏ธ **Next:** Part 03: Inference Pipeline\n", + "\n", + "In the following notebook you will use your model for Serving Vector Inference.\n" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/advanced_tutorials/transformation_functions/custom/3_inference_pipeline.ipynb b/advanced_tutorials/transformation_functions/custom/3_inference_pipeline.ipynb new file mode 100644 index 00000000..0b933bc2 --- /dev/null +++ b/advanced_tutorials/transformation_functions/custom/3_inference_pipeline.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d28eba60", + "metadata": {}, + "source": [ + "# **Hopsworks Feature Store** - Part 03: Inference Pipeline\n" + ] + }, + { + "cell_type": "markdown", + "id": "f16367c8", + "metadata": {}, + "source": [ + "## ๐Ÿ“ก Connecting to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed952ece", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "markdown", + "id": "e98e32ce", + "metadata": {}, + "source": [ + "## โš™๏ธ Feature Group Retrieval\n", + "Let's retrieve a feature group in order to get cc_num values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2a8475b", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the 'transactions_fraud_online_fg' feature group\n", + "trans_fg = fs.get_feature_group(\n", + " 'transactions_fraud_online_fg',\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c37c5197", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the first 5 unique credit card numbers (cc_nums)\n", + "cc_nums = trans_fg.select('cc_num').show(5).cc_num.values\n", + "\n", + "# Display the obtained cc_nums\n", + "cc_nums" + ] + }, + { + "cell_type": "markdown", + "id": "6d5dade0", + "metadata": {}, + "source": [ + "## ๐Ÿ—„ Model Registry\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be66f4c8", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the Model Registry\n", + "mr = project.get_model_registry()" + ] + }, + { + "cell_type": "markdown", + "id": "903df073", + "metadata": {}, + "source": [ + "## ๐Ÿš€ Fetch Deployment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4303ac82", + "metadata": {}, + "outputs": [], + "source": [ + "# Access the Model Serving\n", + "ms = project.get_model_serving()\n", + "\n", + "# Specify the deployment name\n", + "deployment_name = \"fraudonlinemodeldeployment\"\n", + "\n", + "# Get the deployment with the specified name\n", + "deployment = ms.get_deployment(deployment_name)\n", + "\n", + "# Start the deployment and wait for it to be in a running state for up to 300 seconds\n", + "deployment.start(await_running=300)" + ] + }, + { + "cell_type": "markdown", + "id": "045ba7e4", + "metadata": {}, + "source": [ + "## ๐Ÿ”ฎ Predicting using deployment\n", + "\n", + "\n", + "Finally you can start making predictions with your model!\n", + "\n", + "Send inference requests to the deployed model as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42196023", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the first credit card number\n", + "cc_num = cc_nums[0]\n", + "cc_num" + ] + }, + { + "cell_type": "markdown", + "id": "188f959f-03b8-428a-99af-84b342c8777e", + "metadata": {}, + "source": [ + "Remember that the feature view includes the on-demand feature `age_at_transaction`, which requires the transaction time request parameter for its computation. To simplify the process, letโ€™s create a simulated transaction time that can be used to compute this on-demand feature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22a004b7-67ce-4ba2-9bcb-ca3135bedf23", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "transaction_time = datetime.now().strftime(\"%m/%d/%Y, %H:%M:%S\")" + ] + }, + { + "cell_type": "markdown", + "id": "80323999-c75f-4dfc-959a-4a9a6feac175", + "metadata": {}, + "source": [ + "Now, let's pass the request parameter `transaction_time` along with `cc_num` to the predict function so that the on-demand feature `age_at_transaction` can be computed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "596f3241", + "metadata": {}, + "outputs": [], + "source": [ + "# Make a prediction\n", + "deployment.predict(\n", + " inputs=[int(cc_num), transaction_time],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f260b7b6", + "metadata": {}, + "outputs": [], + "source": [ + "# Predict for several cc_nums\n", + "predictions = [\n", + " deployment.predict(inputs=[int(cc_num), transaction_time])['predictions'] \n", + " for cc_num\n", + " in cc_nums\n", + "]\n", + "predictions" + ] + }, + { + "cell_type": "markdown", + "id": "0b02e2bd", + "metadata": {}, + "source": [ + "### Stop Deployment\n", + "To stop the deployment you simply run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b90b4c19", + "metadata": {}, + "outputs": [], + "source": [ + "# Stop the deployment\n", + "deployment.stop(await_stopped=180)" + ] + }, + { + "cell_type": "markdown", + "id": "8d98d2a0", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### ๐Ÿฅณ Next Steps \n", + "Congratulations you've now completed the Transformation Functions tutorial in Hopsworks.\n", + "\n", + "Check out our other tutorials on โžก https://github.com/logicalclocks/hopsworks-tutorials\n", + "\n", + "Or documentation at โžก https://docs.hopsworks.ai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e99b1614-686e-4310-bd5c-92821c76ca56", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/transformation_functions/custom/custom_transformation_functions.ipynb b/advanced_tutorials/transformation_functions/custom/custom_transformation_functions.ipynb deleted file mode 100644 index fc404587..00000000 --- a/advanced_tutorials/transformation_functions/custom/custom_transformation_functions.ipynb +++ /dev/null @@ -1,916 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d6fb35cf", - "metadata": {}, - "source": [ - "# ๐Ÿ‘จ๐Ÿปโ€๐Ÿซ Custom Transformation Functions\n", - "\n", - "In this tutorial you will learn how to **register custom transformation functions in hopsworks feature store** and use them in **training and inference pipelines**." 
- ] - }, - { - "cell_type": "markdown", - "id": "a19fd127", - "metadata": {}, - "source": [ - "## ๐Ÿ—„๏ธ Table of Contents\n", - "- [๐Ÿ“ Imports](#1)\n", - "- [โ›ณ๏ธ Feature Pipeline](#t1)\n", - " - [๐Ÿ’ฝ Loading Data](#2)\n", - " - [๐Ÿ”ฎ Connecting to Hopsworks Feature Store](#3)\n", - " - [๐Ÿช„ Creating Feature Groups](#4)\n", - "- [โ›ณ๏ธ Training Pipeline](#t2)\n", - " - [๐Ÿ‘ฉ๐Ÿปโ€๐Ÿ”ฌ Custom Transformation Functions](#12)\n", - " - [โœ๐Ÿป Registering Custom Transformation Functions in Hopsworks](#5)\n", - " - [๐Ÿ– Feature View Creation](#6)\n", - " - [๐Ÿงฌ Modeling](#7)\n", - " - [๐Ÿ’พ Saving the Model in the Model Registry](#8)\n", - "- [โ›ณ๏ธ Inference Pipeline](#t3)\n", - " - [๐Ÿ“ฎ Retrieving the Model from the Model Registry](#9)\n", - " - [๐Ÿ‘จ๐Ÿปโ€โš–๏ธ Batch Prediction](#10)\n", - " - [๐Ÿ‘จ๐Ÿปโ€โš–๏ธ Real-time Predictions](#11)" - ] - }, - { - "cell_type": "markdown", - "id": "3cc6a7e9", - "metadata": {}, - "source": [ - "\n", - "## ๐Ÿ“ Imports " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22c08e9f", - "metadata": {}, - "outputs": [], - "source": [ - "# Importing necessary libraries\n", - "import pandas as pd # For data manipulation and analysis using DataFrames\n", - "import numpy as np # For numerical computations and arrays\n", - "import os # For operating system-related functions\n", - "import joblib # For saving and loading model files\n", - "\n", - "import xgboost as xgb # For using the XGBoost machine learning library\n", - "from sklearn.metrics import accuracy_score # For evaluating model accuracy" - ] - }, - { - "cell_type": "markdown", - "id": "97bc8784", - "metadata": {}, - "source": [ - "---\n", - "\n", - "# โ›ณ๏ธ Feature Pipeline \n", - "\n", - "In this section you will load data, create a Hopsworks feature group and insert your dataset into created feature group." - ] - }, - { - "cell_type": "markdown", - "id": "4562f488", - "metadata": {}, - "source": [ - "\n", - "## ๐Ÿ’ฝ Loading Data \n", - "\n", - "To begin with, let's load a dataset which contains air quality measurements for different cities from 2013-01-01 to 2023-04-11." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1cfebd42", - "metadata": {}, - "outputs": [], - "source": [ - "# Load the data\n", - "df_original = pd.read_csv(\"https://repo.hops.works/dev/davit/air_quality/backfill_pm2_5_eu.csv\")\n", - "df_original.head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "031974ca", - "metadata": {}, - "source": [ - "Now let's add a target variable to the DataFrame. For simplicity and for demonstration purposes you will randomly assign either a 0 or a 1 to each row." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "411520b2", - "metadata": {}, - "outputs": [], - "source": [ - "# Generate a binary target column\n", - "df_original['target'] = np.random.choice(\n", - " [0, 1], \n", - " size=len(df_original),\n", - ")\n", - "df_original.head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "b2f4b822", - "metadata": {}, - "source": [ - "\n", - "## ๐Ÿ”ฎ Connecting to Hopsworks Feature Store \n", - "\n", - "The next step is to login to the Hopsworks platform. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49800275", - "metadata": {}, - "outputs": [], - "source": [ - "import hopsworks\n", - "\n", - "project = hopsworks.login()\n", - "\n", - "fs = project.get_feature_store() " - ] - }, - { - "cell_type": "markdown", - "id": "60c9e83b", - "metadata": {}, - "source": [ - "\n", - "## ๐Ÿช„ Creating Feature Groups\n", - "\n", - "Now you need to create a Feature Group and insert your dataset.\n", - "\n", - "You will use `.get_or_create_feature_group()` method of the feature store object.\n", - "\n", - "You can read about **Feature Groups** [here](https://docs.hopsworks.ai/3.2/concepts/fs/feature_group/fg_overview/)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6e5c898", - "metadata": {}, - "outputs": [], - "source": [ - "feature_group = fs.get_or_create_feature_group(\n", - " name='feature_group_online',\n", - " description='Online Feature Group',\n", - " version=1,\n", - " primary_key=['city_name', 'date'],\n", - " online_enabled=True,\n", - ") \n", - "feature_group.insert(df_original)" - ] - }, - { - "cell_type": "markdown", - "id": "eca75c35", - "metadata": {}, - "source": [ - "---\n", - "\n", - "# โ›ณ๏ธ Training Pipeline \n", - "\n", - "In the **Training Pipeline** you will register custom transformation functions in the Hopsworks Feature Store, apply them to specific columns in the feature view, split dataset into train and test and train the XGBClassifier. Then you will register your trained model in the Hopsworks Model Registry." - ] - }, - { - "cell_type": "markdown", - "id": "06b8350d", - "metadata": {}, - "source": [ - "\n", - "## ๐Ÿ‘ฉ๐Ÿปโ€๐Ÿ”ฌ Custom Transformation Functions\n", - "\n", - "In the `transformations.py` file you can find the custom `encode_city_name` and `scale_pm2_5` transformation functions which will be registered in the Hopsworks Feature Store and then attached to feature view during feature view creation for further data transformation.\n", - "\n", - "Let's import them and see how they work." - ] - }, - { - "cell_type": "markdown", - "id": "7c66cd33", - "metadata": {}, - "source": [ - "If your code is running internally within Hopsworks, to register custom transformation functions in the feature store they need to be either part of the library installed in Hopsworks or attached when starting a Jupyter notebook or Hopsworks job.\n", - "\n", - "Uncomment the next cell to download `transformations` file with custom transformation functions." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a3dc2540", - "metadata": {}, - "outputs": [], - "source": [ - "#!wget https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/advanced_tutorials/transformation_functions/custom/transformations.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1dbb42d0", - "metadata": {}, - "outputs": [], - "source": [ - "from transformations import encode_city_name, scale_pm2_5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52881e65", - "metadata": {}, - "outputs": [], - "source": [ - "city_name = 'Madrid'\n", - "encoded_city_name = encode_city_name(city_name)\n", - "print(\"โ›ณ๏ธ Encoded City Name:\", encoded_city_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd0bf161", - "metadata": {}, - "outputs": [], - "source": [ - "pm2_5_value = 13.0\n", - "scaled_pm2_5 = scale_pm2_5(pm2_5_value)\n", - "print(\"โ›ณ๏ธ Scaled PM2.5 Value:\", scaled_pm2_5)" - ] - }, - { - "cell_type": "markdown", - "id": "679112d9", - "metadata": {}, - "source": [ - "\n", - "## โœ๐Ÿป Registering Custom Transformation Functions in Hopsworks\n", - "\n", - "The next step is to **register custom transformation functions** in Hopsworks Feature Store.\n", - "\n", - "You can check existing transformation functions in feature store using the `.get_transformation_functions()` method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23d9adb7", - "metadata": {}, - "outputs": [], - "source": [ - "# Check existing transformation functions\n", - "fns = [\n", - " fn.name \n", - " for fn \n", - " in fs.get_transformation_functions()\n", - "]\n", - "fns" - ] - }, - { - "cell_type": "markdown", - "id": "f1ffb06c", - "metadata": {}, - "source": [ - "You can register your transformation function using the `.create_transformation_function()` method with the next parameters:\n", - "\n", - "- `transformation_function` - your custom transformation function.\n", - "\n", - "- `output_type` - python or numpy output type that will be inferred as pyspark.sql.types type.\n", - "\n", - "- `version` - version of your custom transformation function.\n", - "\n", - "Then don't forget to use the `.save()` method in order to persist transformation function in the backend." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f187f4f5", - "metadata": {}, - "outputs": [], - "source": [ - "# Register encode_city_name in Hopsworks\n", - "if \"encode_city_name\" not in fns:\n", - " encoder = fs.create_transformation_function(\n", - " encode_city_name, \n", - " output_type=int,\n", - " version=1,\n", - " )\n", - " encoder.save()\n", - " \n", - "# Register scale_pm2_5 in Hopsworks\n", - "if \"scale_pm2_5\" not in fns:\n", - " scaler = fs.create_transformation_function(\n", - " scale_pm2_5, \n", - " output_type=float,\n", - " version=1,\n", - " )\n", - " scaler.save()" - ] - }, - { - "cell_type": "markdown", - "id": "d229ab74", - "metadata": {}, - "source": [ - "Now let's check if your custom transformation functions are present in the feature store." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "115f5ebc", - "metadata": {}, - "outputs": [], - "source": [ - "# Check it your transformation functions are present in the feature store\n", - "fns = [\n", - " fn.name \n", - " for fn \n", - " in fs.get_transformation_functions()\n", - "]\n", - "fns" - ] - }, - { - "cell_type": "markdown", - "id": "a6b8cf14", - "metadata": {}, - "source": [ - "\n", - "## ๐Ÿ– Feature View Creation\n", - "\n", - "In this part you will retrieve your custom transformation functions from the feature store, build a Query object and create a feature view.\n", - "\n", - "To retrieve your custom transformation function you need to use the `.get_transformation_function()` method by specifying the **name** and **version**." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cd0a0c45", - "metadata": {}, - "outputs": [], - "source": [ - "# Retrieve encode_city_name transformation function\n", - "encoder = fs.get_transformation_function(\n", - " name=\"encode_city_name\",\n", - " version=1,\n", - ")\n", - "\n", - "# Retrieve scale_pm2_5 transformation function\n", - "scaler = fs.get_transformation_function(\n", - " name=\"scale_pm2_5\",\n", - " version=1,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "54875b4a", - "metadata": {}, - "source": [ - "In Hopsworks Feature Store, a Query object allows you to select specific features from a feature group.\n", - "\n", - "`feature_group.select_except(['date'])` selects all columns from the feature group except for the 'date' column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "596fa8e0", - "metadata": {}, - "outputs": [], - "source": [ - "# Select features for training data\n", - "selected_features = feature_group.select_except(['date'])\n", - "\n", - "# Uncomment this if you would like to view your selected features\n", - "# selected_features.show(5)" - ] - }, - { - "cell_type": "markdown", - "id": "83db5e88", - "metadata": {}, - "source": [ - "After creating the Query object, you will create a feature view.\n", - "\n", - "A feature view is a logical representation of data which can be used for real-time serving or batch processing. \n", - "\n", - "You can read more about **Feature Views** [here](https://docs.hopsworks.ai/3.2/concepts/fs/feature_view/fv_overview/)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26622c9c", - "metadata": {}, - "outputs": [], - "source": [ - "# Get or create a feature view\n", - "feature_view = fs.get_or_create_feature_view(\n", - " name='serving_fv',\n", - " version=1,\n", - " query=selected_features,\n", - " # Apply your custom transformation functions to necessary columns\n", - " transformation_functions={\n", - " \"city_name\": encoder,\n", - " \"pm2_5\": scaler,\n", - " },\n", - " labels=['target'],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "ccd708ba", - "metadata": {}, - "source": [ - "## ๐Ÿ‹๏ธ Training Dataset Creation\n", - "The next step is to create the train-test split of your data.\n", - "\n", - "Let's clarify the next parameters of the `.train_test_split()` method:\n", - "\n", - "- test_size=0.1: This parameter specifies the size of the test set relative to the entire dataset. In this case, the test set will contain 10% of the data, and the train set will have the remaining 90%.\n", - "\n", - "- description='Description of the dataset': A brief description provided for the train-test split dataset, explaining its purpose or any other relevant information." 
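If a validation set is needed (for example for early stopping), hsfs feature views also expose a three-way split alongside `train_test_split`. This is a sketch only; the parameter names follow the documented `train_validation_test_split` signature, but check the docs of your hsfs version before relying on it.

```python
# Sketch: three-way split into train, validation and test sets.
X_train, X_val, X_test, y_train, y_val, y_test = feature_view.train_validation_test_split(
    validation_size=0.1,
    test_size=0.1,
    description='Train/validation/test split of the dataset',
)
```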
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53b26f67", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a train-test split dataset\n", - "X_train, X_test, y_train, y_test = feature_view.train_test_split(\n", - " test_size=0.1,\n", - " description='Description of the dataset',\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30503127", - "metadata": {}, - "outputs": [], - "source": [ - "X_train.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92db837f", - "metadata": {}, - "outputs": [], - "source": [ - "y_train.head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "248bc5d1", - "metadata": {}, - "source": [ - "\n", - "## ๐Ÿงฌ Modeling\n", - "\n", - "As a machine learning algorithm you will use the XGBClassifier.\n", - "\n", - "Let's initialize it, fit on train data and then evaluate using Accuracy Score." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f1346a0", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize XGBClassifier\n", - "xgb_classifier = xgb.XGBClassifier()\n", - "\n", - "# Fit the classifier\n", - "xgb_classifier.fit(X_train, y_train)\n", - "\n", - "# Evaluate the model\n", - "y_pred = xgb_classifier.predict(X_test)\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "print(\"๐Ÿ‘ฎ๐Ÿปโ€โ™‚๏ธ Accuracy:\", accuracy)" - ] - }, - { - "cell_type": "markdown", - "id": "b230ca05", - "metadata": {}, - "source": [ - "## ๐Ÿ—„ Model Registry\n", - "\n", - "In Hopsworks, the Model Registry is a crucial component used to manage and version machine learning models. It acts as a centralized repository where trained models can be stored, tracked, and shared among team members.\n", - "\n", - "By calling `project.get_model_registry()`, the code retrieves a reference to the Model Registry associated with the current Hopsworks project. This reference allows the user to interact with the Model Registry and perform operations such as registering, versioning, and accessing trained machine learning models.\n", - "With the Model Registry, data scientists and machine learning engineers can effectively collaborate, track model changes, and easily deploy the best-performing models to production environments." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67616e13", - "metadata": {}, - "outputs": [], - "source": [ - "mr = project.get_model_registry()" - ] - }, - { - "cell_type": "markdown", - "id": "f50bed8a", - "metadata": {}, - "source": [ - "### โš™๏ธ Model Schema\n", - "\n", - "The next step is to **define input and output schema** of a machine learning model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8f9569dc", - "metadata": {}, - "outputs": [], - "source": [ - "from hsml.schema import Schema\n", - "from hsml.model_schema import ModelSchema\n", - "\n", - "input_schema = Schema(X_train.values)\n", - "output_schema = Schema(y_train)\n", - "model_schema = ModelSchema(\n", - " input_schema=input_schema,\n", - " output_schema=output_schema,\n", - ")\n", - "\n", - "model_schema.to_dict()" - ] - }, - { - "cell_type": "markdown", - "id": "8de1abf1", - "metadata": {}, - "source": [ - "\n", - "### ๐Ÿ’พ Saving the Model\n", - "\n", - "Now you are ready to register your model in the Hopsworks Moder Registry.\n", - "\n", - "To begin with, let's create the `xgb_model` model directory and save the trained model in this directory." 
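The next cell guards the directory creation with an explicit `isdir` check; `os.makedirs` with `exist_ok=True` is a shorter equivalent, sketched here for reference.

```python
# Equivalent, slightly more idiomatic form of the directory-creation step below.
import os

model_dir = "xgb_model"
os.makedirs(model_dir, exist_ok=True)  # no-op if the directory already exists

# The trained classifier is then saved exactly as in the next cell:
# xgb_classifier.save_model(os.path.join(model_dir, "model.json"))
```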
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f790036", - "metadata": {}, - "outputs": [], - "source": [ - "model_dir = \"xgb_model\"\n", - "\n", - "if os.path.isdir(model_dir) == False:\n", - " os.mkdir(model_dir)\n", - "\n", - "# Save the model\n", - "xgb_classifier.save_model(model_dir + \"/model.json\")" - ] - }, - { - "cell_type": "markdown", - "id": "dfbba175", - "metadata": {}, - "source": [ - "To register your model in the Hopsworks model registry you can use `.create_model()` method with the next parameters:\n", - "\n", - "- name=\"xgb_model\": The name of the model.\n", - "\n", - "- metrics={\"Accuracy\": accuracy}: The model's performance metrics are specified as a dictionary, with \"Accuracy\" as the key and the value being the accuracy score computed earlier in the code. This metric represents the accuracy of the model's predictions on the test data.\n", - "\n", - "- description=\"XGB model\": A brief description of the model.\n", - "\n", - "- input_example=X_train.sample(): An example input from the training data (X_train) is used to demonstrate the expected format of the model's input data. It is randomly sampled from X_train.\n", - "\n", - "- model_schema=model_schema: The model schema, which represents the data input and output structure of the model, is specified using the previously defined model_schema." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5adae94c", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a model in the model registry\n", - "model = mr.python.create_model(\n", - " name=\"xgb_model\",\n", - " metrics={\"Accuracy\": accuracy}, \n", - " description=\"XGB model\",\n", - " input_example=X_train.sample(),\n", - " model_schema=model_schema,\n", - ")\n", - "\n", - "model.save(model_dir)" - ] - }, - { - "cell_type": "markdown", - "id": "9d416af2", - "metadata": {}, - "source": [ - "---\n", - "\n", - "# โ›ณ๏ธ Inference Pipeline \n", - "\n", - "In the **Inference Pipeline** section, you will retrieve your model from Hopsworks Model Registry and utilize this model to make predictions on both Batch Data and Online Feature Vectors." - ] - }, - { - "cell_type": "markdown", - "id": "d8c45f8e", - "metadata": {}, - "source": [ - "\n", - "## ๐Ÿ“ฎ Retrieving the Model from Model Registry \n", - "\n", - "To retrieve a previously registered machine learning model from the Hopsworks Model Registry you need to use the `.get_model()` method with the next parameters:\n", - "\n", - "- name=\"xgb_model\": The name of the model to be retrieved.\n", - "\n", - "- version=1: The version number of the model to be retrieved.\n", - "\n", - "Then you will download the model from the Model Registry." 
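The retrieve, download and load steps described above can be wrapped in a small helper. This sketch relies only on calls already used in this notebook (`mr.get_model`, `Model.download`, `XGBClassifier.load_model`).

```python
# Convenience wrapper around the retrieve/download/load steps shown below.
import os
import xgboost as xgb

def load_registered_xgb(model_registry, name="xgb_model", version=1):
    """Fetch a registered model from Hopsworks and load it into an XGBClassifier."""
    registered_model = model_registry.get_model(name=name, version=version)
    local_dir = registered_model.download()
    classifier = xgb.XGBClassifier()
    classifier.load_model(os.path.join(local_dir, "model.json"))
    return classifier

# model = load_registered_xgb(mr)
```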
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42290763", - "metadata": {}, - "outputs": [], - "source": [ - "# Retrieve your model from the model registry\n", - "retrieved_model = mr.get_model(\n", - " name=\"xgb_model\",\n", - " version=1,\n", - ")\n", - "saved_model_dir = retrieved_model.download()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28d78f36", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the model\n", - "model = xgb.XGBClassifier()\n", - "\n", - "# Load the model from a saved JSON file\n", - "model.load_model(saved_model_dir + \"/model.json\")\n", - "model" - ] - }, - { - "cell_type": "markdown", - "id": "bf8d901d", - "metadata": {}, - "source": [ - "\n", - "## ๐Ÿ‘จ๐Ÿปโ€โš–๏ธ Batch Prediction \n", - "\n", - "Batch prediction is a process in which a trained machine learning model is used to make predictions on a large set of data all at once." - ] - }, - { - "cell_type": "markdown", - "id": "9098714a", - "metadata": {}, - "source": [ - "To retrieve batch data from the feature view you need to use `init_batch_scoring` method of the feature view object.\n", - "\n", - "`training_dataset_version` parameter specifies the version number of the training dataset that will be used for scoring.\n", - "\n", - "Then you can use the `.get_batch_data()` method to retrieve batch data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8be1550", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialise feature view to retrieve batch data\n", - "feature_view.init_batch_scoring(1)\n", - "\n", - "# Retrieve batch data\n", - "batch_data = feature_view.get_batch_data()\n", - "batch_data.head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "1bcf497b", - "metadata": {}, - "source": [ - "Now let's use retrieved model to predict batch data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c930266d", - "metadata": {}, - "outputs": [], - "source": [ - "# Predict batch data using retrieved model\n", - "predictions_batch = model.predict(batch_data)\n", - "predictions_batch[:10]" - ] - }, - { - "cell_type": "markdown", - "id": "93db8b23", - "metadata": {}, - "source": [ - "\n", - "## ๐Ÿ‘จ๐Ÿปโ€โš–๏ธ Real-time Predictions\n", - "\n", - "**Real-time Predictions** is a process of using a trained machine learning model to make predictions on feature vector(s) in real-time. \n", - "\n", - "To begin with, let's create `to_numpy` function which will transform a feature vector(s) list into a numpy array with a proper shape." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a52389de", - "metadata": {}, - "outputs": [], - "source": [ - "def to_numpy(feature_vector):\n", - " \"\"\"\n", - " Converts a given feature vector into a NumPy array with a shape of (-1, 2).\n", - "\n", - " Parameters:\n", - " vector (list or array-like): The input vector to be converted.\n", - "\n", - " Returns:\n", - " numpy.ndarray: A NumPy array with a shape of (-1, 2) containing the elements from the input vector.\n", - " \"\"\"\n", - " return np.array(feature_vector).reshape(-1,2)" - ] - }, - { - "cell_type": "markdown", - "id": "e50f54a7", - "metadata": {}, - "source": [ - "The next step is to initialize the feature view for serving and then retrieve a feature vector with specified primary keys." 
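A short illustration of why `to_numpy` above reshapes to `(-1, 2)`: after excluding `date` and using `target` as the label, a serving vector carries exactly two model inputs, the encoded city name and the scaled pm2_5 value. The numbers below are derived from the constants in the custom functions and are purely illustrative.

```python
# Shape check for the to_numpy helper defined above (illustrative values).
import numpy as np

# encode_city_name('Amsterdam') -> 0, scale_pm2_5(13.0) -> (13.0 - 14.5) / 13 ~ -0.115
feature_vector = [0, -0.115]
print(np.array(feature_vector).reshape(-1, 2).shape)   # (1, 2)

# Several vectors stacked together reshape to (n, 2), matching what
# xgb_classifier.predict() expects.
feature_vectors = [[0, -0.115], [0, 0.5], [0, -0.3], [0, 1.2]]
print(np.array(feature_vectors).reshape(-1, 2).shape)  # (4, 2)
```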
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8f9595f", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialise feature view to retrieve feature vector\n", - "feature_view.init_serving(1)\n", - "\n", - "# Retrieve a feature vector\n", - "feature_vector = feature_view.get_feature_vector(\n", - " entry = {\n", - " \"city_name\": 'Amsterdam',\n", - " \"date\": '2013-01-01',\n", - " }\n", - ")\n", - "feature_vector" - ] - }, - { - "cell_type": "markdown", - "id": "9c659edd", - "metadata": {}, - "source": [ - "Now you can use your model to predict the feature vector." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b6aa6c1", - "metadata": {}, - "outputs": [], - "source": [ - "# Predict feature vector using retrieved model\n", - "prediction_feature_vector = model.predict(to_numpy(feature_vector))\n", - "prediction_feature_vector" - ] - }, - { - "cell_type": "markdown", - "id": "dd1e7328", - "metadata": {}, - "source": [ - "In addition, you can retrieve several feature vectors. Just pass primary keys as a list of dictionaries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9d8fbc8", - "metadata": {}, - "outputs": [], - "source": [ - "# Retrieve feature vectors from feature store\n", - "feature_vectors = feature_view.get_feature_vectors(\n", - " entry = [\n", - " {\"city_name\": 'Amsterdam', \"date\": '2013-01-01'},\n", - " {\"city_name\": 'Amsterdam', \"date\": '2014-01-01'},\n", - " {\"city_name\": 'Amsterdam', \"date\": '2015-01-01'},\n", - " {\"city_name\": 'Amsterdam', \"date\": '2016-01-01'},\n", - " ]\n", - ")\n", - "feature_vectors" - ] - }, - { - "cell_type": "markdown", - "id": "ccfce535", - "metadata": {}, - "source": [ - "Now you can use your model to predict feature vectors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8db998a2", - "metadata": {}, - "outputs": [], - "source": [ - "# Predict feature vectors using retrieved model\n", - "prediction_feature_vectors = model.predict(to_numpy(feature_vectors))\n", - "prediction_feature_vectors" - ] - }, - { - "cell_type": "markdown", - "id": "0c202c74", - "metadata": {}, - "source": [ - "---" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/advanced_tutorials/transformation_functions/custom/features/transactions_fraud.py b/advanced_tutorials/transformation_functions/custom/features/transactions_fraud.py new file mode 100644 index 00000000..bbe8f33e --- /dev/null +++ b/advanced_tutorials/transformation_functions/custom/features/transactions_fraud.py @@ -0,0 +1,122 @@ +from math import radians +import numpy as np +import pandas as pd +from typing import Union + +def haversine(long: pd.Series, lat: pd.Series, shift: int) -> np.ndarray: + """ + Compute Haversine distance between each consecutive coordinate in (long, lat). 
+ + Parameters: + - long: pandas Series, longitude values + - lat: pandas Series, latitude values + - shift: int, the number of positions to shift for calculating distances + + Returns: + - numpy array, Haversine distances + """ + long_shifted = long.shift(shift) + lat_shifted = lat.shift(shift) + long_diff = long_shifted - long + lat_diff = lat_shifted - lat + + a = np.sin(lat_diff/2.0)**2 + b = np.cos(lat) * np.cos(lat_shifted) * np.sin(long_diff/2.0)**2 + c = 2*np.arcsin(np.sqrt(a + b)) + + return c + + +def time_delta(datetime_value: pd.Series, shift: int) -> pd.Series: + """ + Compute time difference between each consecutive transaction. + + Parameters: + - datetime_value: pandas Series, datetime values + - shift: int, the number of positions to shift for calculating time differences + + Returns: + - pandas Series, time differences + """ + time_shifted = datetime_value.shift(shift) + return time_shifted + + +def calculate_loc_delta_t_plus_1(df: pd.DataFrame) -> pd.DataFrame: + """ + Calculate loc_delta_t_plus_1 for each group. + + Parameters: + - group: pandas DataFrame group, grouped by 'cc_num' + + Returns: + - pandas Series, loc_delta_t_plus_1 values + """ + df["loc_delta_t_plus_1"] = df.groupby("cc_num").apply( + lambda x: haversine(x["longitude"], x["latitude"], 1) + ).reset_index(level=0, drop=True).fillna(0) + return df + + +def calculate_loc_delta_t_minus_1(df: pd.DataFrame) -> pd.DataFrame: + """ + Calculate loc_delta_t_minus_1 for each group. + + Parameters: + - group: pandas DataFrame group, grouped by 'cc_num' + + Returns: + - pandas Series, loc_delta_t_minus_1 values + """ + df["loc_delta_t_minus_1"] = df.groupby("cc_num").apply( + lambda x: haversine(x["longitude"], x["latitude"], -1) + ).reset_index(level=0, drop=True).fillna(0) + return df + + +def calculate_time_delta_t_minus_1(df: pd.DataFrame) -> pd.DataFrame: + """ + Calculate time_delta_t_minus_1 for each group. + + Parameters: + - group: pandas DataFrame group, grouped by 'cc_num' + + Returns: + - pandas Series, time_delta_t_minus_1 values + """ + df["time_delta_t_minus_1"] = df.groupby("cc_num").apply(lambda x: time_delta(x["datetime"], -1))\ + .reset_index(level=0, drop=True) + return df + + +def prepare_transactions_fraud(trans_df: pd.DataFrame) -> pd.DataFrame: + """ + Prepare transaction data with engineered features for fraud detection. 
+ + Parameters: + - trans_df: pandas DataFrame, transaction data + + Returns: + - pandas DataFrame, prepared transaction data with engineered features + """ + # Sort values and convert latitude and longitude to radians + trans_df.sort_values("datetime", inplace=True) + trans_df[["longitude", "latitude"]] = trans_df[["longitude", "latitude"]].applymap(radians) + + # Calculate loc_delta_t_plus_1, loc_delta_t_minus_1, and time_delta_t_minus_1 using groupby + trans_df = calculate_loc_delta_t_plus_1(trans_df) + + trans_df = calculate_loc_delta_t_minus_1(trans_df) + + trans_df = calculate_time_delta_t_minus_1(trans_df) + + # Normalize time_delta_t_minus_1 to days and handle missing values + trans_df["time_delta_t_minus_1"] = (trans_df["time_delta_t_minus_1"] - trans_df["datetime"]) / np.timedelta64(1, 'D') + trans_df["time_delta_t_minus_1"] = trans_df["time_delta_t_minus_1"].fillna(0) + + # Select relevant columns, drop duplicates, and reset index + trans_df = trans_df[["tid", "datetime", "cc_num", "amount", "country", "fraud_label", + "loc_delta_t_plus_1", "loc_delta_t_minus_1", "time_delta_t_minus_1", "birthdate"]] + trans_df = trans_df.drop_duplicates(subset=['cc_num', 'datetime']).reset_index(drop=True) + + return trans_df \ No newline at end of file diff --git a/advanced_tutorials/transformation_functions/custom/images/fv_mdt_odt.png b/advanced_tutorials/transformation_functions/custom/images/fv_mdt_odt.png new file mode 100644 index 00000000..f251ac9b Binary files /dev/null and b/advanced_tutorials/transformation_functions/custom/images/fv_mdt_odt.png differ diff --git a/advanced_tutorials/transformation_functions/custom/images/on_demand_example.png b/advanced_tutorials/transformation_functions/custom/images/on_demand_example.png new file mode 100644 index 00000000..94a0a8c7 Binary files /dev/null and b/advanced_tutorials/transformation_functions/custom/images/on_demand_example.png differ diff --git a/advanced_tutorials/transformation_functions/custom/transformations.py b/advanced_tutorials/transformation_functions/custom/transformations.py deleted file mode 100644 index 6d261a96..00000000 --- a/advanced_tutorials/transformation_functions/custom/transformations.py +++ /dev/null @@ -1,48 +0,0 @@ -def encode_city_name(city_name): - """ - Encode the 'city_name' element using a custom LabelEncoder-like approach. - - Parameters: - city_name (str): - The 'city_name' element to be encoded. - - Returns: - int: - The encoded city name as an integer. - """ - # Define a mapping of city names to their corresponding integer labels - label_map = { - 'Amsterdam': 0, 'Athina': 1, 'Berlin': 2, 'Gdansk': 3, 'Krakรณw': 4, - 'London': 5, 'Madrid': 6, 'Marseille': 7, 'Milano': 8, 'Mรผnchen': 9, - 'Napoli': 10, 'Paris': 11, 'Sevilla': 12, 'Stockholm': 13, 'Tallinn': 14, - 'Varna': 15, 'Wien': 16 - } - - # Return the integer label for the input city_name using label_map.get() - # If the city_name is not found in label_map, return -1 as the default value - return label_map.get(city_name, -1) - - -def scale_pm2_5(pm2_5_value): - """ - Scale the 'pm2_5' value using custom scaling. - - Parameters: - pm2_5_value (float): - The 'pm2_5' value to be scaled. - - Returns: - float: - The scaled 'pm2_5' value. 
- """ - # Define the mean value of the 'pm2_5' column for the scaling process - mean = 14.5 - - # Define the standard deviation value of the 'pm2_5' column for the scaling process - std = 13 - - # Calculate the scaled 'pm2_5' value using the custom scaling formula - scaled_pm2_5 = (pm2_5_value - mean) / std - - # Return the scaled 'pm2_5' value - return scaled_pm2_5 diff --git a/advanced_tutorials/transformation_functions/keras/keras_transformation_functions.ipynb b/advanced_tutorials/transformation_functions/keras/keras_transformation_functions.ipynb index 42867c9f..8ec32f72 100644 --- a/advanced_tutorials/transformation_functions/keras/keras_transformation_functions.ipynb +++ b/advanced_tutorials/transformation_functions/keras/keras_transformation_functions.ipynb @@ -449,7 +449,7 @@ "outputs": [], "source": [ "# Create an instance of the OneHotEncoder and StandardScaler\n", - "one_hot_encoder = OneHotEncoder(sparse=False)\n", + "one_hot_encoder = OneHotEncoder(sparse_output=False)\n", "standard_scaler = StandardScaler()" ] }, @@ -1078,7 +1078,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/advanced_tutorials/transformation_functions/pytorch/pytorch_transformation_functions.ipynb b/advanced_tutorials/transformation_functions/pytorch/pytorch_transformation_functions.ipynb index dbd4b334..f7dec167 100644 --- a/advanced_tutorials/transformation_functions/pytorch/pytorch_transformation_functions.ipynb +++ b/advanced_tutorials/transformation_functions/pytorch/pytorch_transformation_functions.ipynb @@ -451,7 +451,7 @@ "outputs": [], "source": [ "# Create an instance of the OneHotEncoder and StandardScaler\n", - "one_hot_encoder = OneHotEncoder(sparse=False)\n", + "one_hot_encoder = OneHotEncoder(sparse_output=False)\n", "standard_scaler = StandardScaler()" ] }, @@ -1158,7 +1158,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/api_examples/hsfs/feature_monitoring/feature_monitoring.ipynb b/api_examples/hsfs/feature_monitoring/feature_monitoring.ipynb index ab340829..3586b081 100644 --- a/api_examples/hsfs/feature_monitoring/feature_monitoring.ipynb +++ b/api_examples/hsfs/feature_monitoring/feature_monitoring.ipynb @@ -201,9 +201,9 @@ "\n", "min_max_scaler = fs.get_transformation_function(name=\"min_max_scaler\")\n", "\n", - "transformation_functions = {\n", - " \"amount\": min_max_scaler,\n", - "}\n", + "transformation_functions = [\n", + " min_max_scaler(\"amount\") \n", + "]\n", "\n", "trans_fv = fs.create_feature_view(\n", " name=\"trans_fv\",\n", diff --git a/churn/2_churn_training_pipeline.ipynb b/churn/2_churn_training_pipeline.ipynb index 0d34eddd..37461599 100644 --- a/churn/2_churn_training_pipeline.ipynb +++ b/churn/2_churn_training_pipeline.ipynb @@ -167,15 +167,15 @@ "]\n", "\n", "# Map features to their corresponding transformation functions\n", - "transformation_functions = {}\n", + "transformation_functions = []\n", "\n", "# For numerical features, use the min_max_scaler transformation\n", "for feature in numerical_features:\n", - " transformation_functions[feature] = min_max_scaler\n", + " transformation_functions.append(min_max_scaler(feature))\n", "\n", "# For categorical features, use the label_encoder transformation\n", "for feature in categorical_features:\n", - " transformation_functions[feature] = label_encoder" + " 
transformation_functions.append(label_encoder(feature)) " ] }, { diff --git a/churn/3_churn_batch_inference.ipynb b/churn/3_churn_batch_inference.ipynb index c907d5a1..a87b743a 100644 --- a/churn/3_churn_batch_inference.ipynb +++ b/churn/3_churn_batch_inference.ipynb @@ -243,23 +243,27 @@ "metadata": {}, "outputs": [], "source": [ - "import inspect \n", - "\n", - "# Recall that you applied transformation functions, such as min max scaler and laber encoder. \n", - "# Now you want to transform them back to human readable format.\n", "df_all = batch_data.copy()\n", - "td_transformation_functions = feature_view._batch_scoring_server._transformation_functions\n", - "for feature_name in td_transformation_functions:\n", - " td_transformation_function = td_transformation_functions[feature_name]\n", - " sig, foobar_locals = inspect.signature(td_transformation_function.transformation_fn), locals()\n", - " param_dict = dict([(param.name, param.default) for param in sig.parameters.values() if param.default != inspect._empty])\n", - " if td_transformation_function.name == \"label_encoder\":\n", - " rev_dict = {v: k for k, v in param_dict[\"value_to_index\"].items()}\n", - " df_all[feature_name] = df_all[feature_name].map(lambda x: rev_dict[x])\n", - " if td_transformation_function.name == \"min_max_scaler\":\n", - " df_all[feature_name] = df_all[feature_name].map(lambda x: x*(param_dict[\"max_value\"]-param_dict[\"min_value\"])+param_dict[\"min_value\"])\n", - "\n", - " \n", + "\n", + "fv_transformation_functions = feature_view._batch_scoring_server.model_dependent_transformation_functions\n", + "\n", + "for transformation_function in fv_transformation_functions:\n", + " udf = transformation_function.hopsworks_udf\n", + " if udf.function_name == \"min_max_scaler\":\n", + " transformed_features = udf.transformation_features[0]\n", + " transformed_feature_name = udf.output_column_names[0]\n", + " stats = udf.transformation_statistics\n", + " df_all[transformed_features] = df_all[transformed_feature_name].map(lambda x: x*(stats.feature.max-stats.feature.min)+stats.feature.min)\n", + " \n", + " \n", + " if udf.function_name == \"label_encoder\":\n", + " transformed_features = udf.transformation_features[0]\n", + " transformed_feature_name = udf.output_column_names[0]\n", + " stats = udf.transformation_statistics\n", + " unique_data = sorted([value for value in stats.feature.unique_values])\n", + " index_to_value = {index: value for index, value in enumerate(unique_data)}\n", + " df_all[transformed_features] = df_all[transformed_feature_name].map(lambda x: index_to_value[x])\n", + "\n", "df_all = df_all\n", "df_all['Churn'] = predictions\n", "df_all.head()" @@ -537,7 +541,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -551,7 +555,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/fraud_batch/2_fraud_batch_training_pipeline.ipynb b/fraud_batch/2_fraud_batch_training_pipeline.ipynb index 2bc7e8e4..1ae4b6b3 100644 --- a/fraud_batch/2_fraud_batch_training_pipeline.ipynb +++ b/fraud_batch/2_fraud_batch_training_pipeline.ipynb @@ -159,9 +159,9 @@ "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n", "\n", "# Map features to transformations.\n", - "transformation_functions = {\n", - " \"category\": label_encoder,\n", - "}" + "transformation_functions = [\n", + " 
label_encoder(\"category\"),\n", + "]" ] }, { diff --git a/fraud_online/2_fraud_online_training_pipeline.ipynb b/fraud_online/2_fraud_online_training_pipeline.ipynb index 90f5a113..d0bb35a8 100644 --- a/fraud_online/2_fraud_online_training_pipeline.ipynb +++ b/fraud_online/2_fraud_online_training_pipeline.ipynb @@ -164,10 +164,10 @@ "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n", "\n", "# Map features to transformation functions.\n", - "transformation_functions = {\n", - " \"country\": label_encoder,\n", - " \"gender\": label_encoder,\n", - "}" + "transformation_functions = [\n", + " label_encoder(\"country\"),\n", + " label_encoder(\"gender\"),\n", + "]" ] }, { diff --git a/integrations/mage_ai/mage_tutorial/custom/transactions_feature_view.py b/integrations/mage_ai/mage_tutorial/custom/transactions_feature_view.py index 9bb1da66..f5e9bfe0 100644 --- a/integrations/mage_ai/mage_tutorial/custom/transactions_feature_view.py +++ b/integrations/mage_ai/mage_tutorial/custom/transactions_feature_view.py @@ -43,9 +43,9 @@ def transform_custom(*args, **kwargs): label_encoder = fs.get_transformation_function(name="label_encoder") # Map features to transformations. - transformation_functions = { - "category": label_encoder, - } + transformation_functions = [ + label_encoder("category"), + ] # Get or create the 'transactions_view' feature view feature_view = fs.get_or_create_feature_view( diff --git a/integrations/mage_ai/mage_tutorial/data_exporters/transactions_feature_view.py b/integrations/mage_ai/mage_tutorial/data_exporters/transactions_feature_view.py index a8c61b6e..00ab82d7 100644 --- a/integrations/mage_ai/mage_tutorial/data_exporters/transactions_feature_view.py +++ b/integrations/mage_ai/mage_tutorial/data_exporters/transactions_feature_view.py @@ -41,9 +41,9 @@ def create_feature_view(data, *args, **kwargs): label_encoder = fs.get_transformation_function(name="label_encoder") # Map features to transformations. 
- transformation_functions = { - "category": label_encoder, - } + transformation_functions = [ + label_encoder("category"), + ] # Get or create the 'transactions_view' feature view feature_view = fs.get_or_create_feature_view( diff --git a/integrations/neo4j/2_training_pipeline.ipynb b/integrations/neo4j/2_training_pipeline.ipynb index e718d173..55309c48 100644 --- a/integrations/neo4j/2_training_pipeline.ipynb +++ b/integrations/neo4j/2_training_pipeline.ipynb @@ -149,16 +149,16 @@ "min_max_scaler = fs.get_transformation_function(name=\"min_max_scaler\")\n", "\n", "# Map features to transformations.\n", - "transformation_functions = {\n", - " \"monthly_in_count\": min_max_scaler,\n", - " \"monthly_in_total_amount\": min_max_scaler,\n", - " \"monthly_in_mean_amount\": min_max_scaler,\n", - " \"monthly_in_std_amount\": min_max_scaler,\n", - " \"monthly_out_count\": min_max_scaler,\n", - " \"monthly_out_total_amount\": min_max_scaler,\n", - " \"monthly_out_mean_amount\": min_max_scaler,\n", - " \"monthly_out_std_amount\": min_max_scaler,\n", - "}" + "transformation_functions = [\n", + " min_max_scaler(\"monthly_in_count\"),\n", + " min_max_scaler(\"monthly_in_total_amount\"),\n", + " min_max_scaler(\"monthly_in_mean_amount\"),\n", + " min_max_scaler(\"monthly_in_std_amount\"),\n", + " min_max_scaler(\"monthly_out_count\"),\n", + " min_max_scaler(\"monthly_out_total_amount\"),\n", + " min_max_scaler(\"monthly_out_mean_amount\"),\n", + " min_max_scaler(\"monthly_out_std_amount\"),\n", + "]" ] }, { diff --git a/integrations/polars/quickstart_polars.ipynb b/integrations/polars/quickstart_polars.ipynb index efc53e80..17f66b06 100644 --- a/integrations/polars/quickstart_polars.ipynb +++ b/integrations/polars/quickstart_polars.ipynb @@ -474,9 +474,9 @@ "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n", "\n", "# Map features to transformations.\n", - "transformation_functions = {\n", - " \"category\": label_encoder,\n", - "}" + "transformation_functions = [\n", + " label_encoder(\"category\"),\n", + "]" ] }, { diff --git a/integrations/pyspark_streaming/2_training_pipeline.ipynb b/integrations/pyspark_streaming/2_training_pipeline.ipynb index 42d7667e..7e93195f 100644 --- a/integrations/pyspark_streaming/2_training_pipeline.ipynb +++ b/integrations/pyspark_streaming/2_training_pipeline.ipynb @@ -161,9 +161,9 @@ "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n", "\n", "# Map features to transformations.\n", - "transformation_functions = {\n", - " \"category\": label_encoder,\n", - "}" + "transformation_functions = [\n", + " label_encoder(\"category\") ,\n", + "]" ] }, { diff --git a/integrations/wandb/2_feature_view_creation.ipynb b/integrations/wandb/2_feature_view_creation.ipynb index a2b68e40..5c1c073b 100755 --- a/integrations/wandb/2_feature_view_creation.ipynb +++ b/integrations/wandb/2_feature_view_creation.ipynb @@ -111,15 +111,15 @@ "\n", "# Map features to transformations.\n", "transformation_functions = {\n", - " \"category\": label_encoder,\n", - " \"amount\": min_max_scaler,\n", - " \"trans_volume_mavg\": min_max_scaler,\n", - " \"trans_volume_mstd\": min_max_scaler,\n", - " \"trans_freq\": min_max_scaler,\n", - " \"loc_delta\": min_max_scaler,\n", - " \"loc_delta_mavg\": min_max_scaler,\n", - " \"age_at_transaction\": min_max_scaler,\n", - " \"days_until_card_expires\": min_max_scaler,\n", + " label_encoder(\"category\"),\n", + " min_max_scaler(\"amount\"),\n", + " min_max_scaler(\"trans_volume_mavg\"),\n", + " 
min_max_scaler(\"trans_volume_mstd\"),\n", + " min_max_scaler(\"trans_freq\"),\n", + " min_max_scaler(\"loc_delta\"),\n", + " min_max_scaler(\"loc_delta_mavg\"),\n", + " min_max_scaler(\"age_at_transaction\"),\n", + " min_max_scaler(\"days_until_card_expires\"),\n", "}" ] }, diff --git a/quickstart.ipynb b/quickstart.ipynb index dd97f88a..71eb58f6 100644 --- a/quickstart.ipynb +++ b/quickstart.ipynb @@ -525,9 +525,9 @@ "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n", "\n", "# Map features to transformations.\n", - "transformation_functions = {\n", - " \"category\": label_encoder,\n", - "}" + "transformation_functions = [\n", + " label_encoder(\"category\"),\n", + "]" ] }, { @@ -907,7 +907,7 @@ " def predict(self, inputs):\n", " \"\"\" Serves a prediction request usign a trained model\"\"\"\n", " feature_vector = self.fv.get_feature_vector({\"cc_num\": inputs[0][0]})\n", - " feature_vector = feature_vector[:-1]\n", + " feature_vector = feature_vector[:-2] + feature_vector[-1:]\n", " \n", " return self.model.predict(np.asarray(feature_vector).reshape(1, -1)).tolist() # Numpy Arrays are not JSON serializable" ] @@ -1134,7 +1134,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.12.5" }, "widgets": { "application/vnd.jupyter.widget-state+json": {