From 464fad63fb46af5ae32ea63c0fedff8619a37312 Mon Sep 17 00:00:00 2001 From: Rupeekshan Maheswaran Date: Mon, 20 Mar 2023 20:12:46 +0530 Subject: [PATCH 1/6] add notebook of FS basic workflow --- 6 Feature Store: Basic Workflow.ipynb | 377 ++++++++++++++++++++++++++ 1 file changed, 377 insertions(+) create mode 100644 6 Feature Store: Basic Workflow.ipynb diff --git a/6 Feature Store: Basic Workflow.ipynb b/6 Feature Store: Basic Workflow.ipynb new file mode 100644 index 0000000..419dd2b --- /dev/null +++ b/6 Feature Store: Basic Workflow.ipynb @@ -0,0 +1,377 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Feature Store - A unified storage of curated features\n", + "\n", + "This notebook is intended to help you get started with Feature Store in the H2O AI Cloud using Python.\n", + "\n", + "* **Product Documentation:** https://docs.h2o.ai/feature-store/latest-stable/docs/index.html" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Prerequisites\n", + "\n", + "Update the `h2o_ai_cloud.py` file with the connection parameters for your H2O AI Cloud environment:\n", + "1. Log in to your H2O AI Cloud environment\n", + "1. Click your username or avatar in the H2O AI Cloud navigation bar\n", + "1. Navigate to `CLI & API Access`\n", + "1. Use the variables from the `Accessing H2O AI Cloud APIs` section to populate the parameters" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "from getpass import getpass\n", + "\n", + "from h2o_ai_cloud import token_provider, fs_client\n", + "from featurestore import CSVFile, Schema\n", + "\n", + "from pyspark.sql import SparkSession" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Securely connect to the platform\n", + "We first connect to the H2O AI Cloud using our platform token to create a token provider object. We can then use this object to log into Feature Store."
+ ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "client = fs_client(token_provider())" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Understand the environment" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "client.get_version()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Configure spark for Feature Store " + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "source": [ + "spark = SparkSession.builder \\\n", + " .master(\"local\") \\\n", + " .config(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.12.238,io.delta:delta-core_2.12:1.2.1\") \\\n", + " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\") \\\n", + " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\") \\\n", + " .getOrCreate()\n", + "\n", + "spark.sparkContext.setLogLevel(\"ERROR\")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Define data source \n", + "Feature Store supports different data sources - https://docs.h2o.ai/feature-store/latest-stable/docs/supported_data_sources.html" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 27, + "source": [ + "source = CSVFile(\"s3a://h2o-public-test-data/smalldata/gbm_test/titanic.csv\")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Extract schema from data source\n", + "The schema represents the features of the feature set" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "schema = client.extract_schema_from_source(source)" + ], + "outputs": [], + "metadata": { + "tags": [] + } + }, + { + "cell_type": "markdown", + "source": [ + "## Create a project\n", + "User can follow naming conventions mentioned in here - https://docs.h2o.ai/feature-store/latest-stable/docs/api/naming_conventions.html" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 29, + "source": [ + "project = client.projects.create(\"sample_project\")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Create a feature set" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 30, + "source": [ + "feature_set = project.feature_sets.register(schema, \"sample_fs\")" + ], + "outputs": [], + "metadata": { + "tags": [] + } + }, + { + "cell_type": "markdown", + "source": [ + "## Ingest data from source\n", + "Uploading data into Feature Store" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "feature_set.ingest(source)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Retrieve the data" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 32, + "source": [ + "reference = feature_set.retrieve()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Download features\n", + "Download the files from Feature Store" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "reference.download()" + ], + "outputs": [], + "metadata": { + "tags": [] + } + }, + { + "cell_type": "markdown", + "source": [ + "## Obtain data as a Spark Frame \n", + "Download 
features as spark dataframe" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "reference.as_spark_frame(spark).show()" + ], + "outputs": [], + "metadata": { + "tags": [] + } + }, + { + "cell_type": "markdown", + "source": [ + "### Prepare a schema from a string\n", + "Schema can be created from a string format" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 35, + "source": [ + "schema_str = \"id integer, value string\"\n", + "schema = Schema.create_from(schema_str)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Create another feature set" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 36, + "source": [ + "fs_online = project.feature_sets.register(schema, \"sample_fs_online\", primary_key=\"id\")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Ingest data from Online Feature Store" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 37, + "source": [ + "fs_online.ingest_online('{\"id\": 1, \"value\": \"test\"}')" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Retrieve data from Online Feature Store" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 10, + "source": [ + "fs_online.retrieve_online(1)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Delete a feature set" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "fs_online.delete()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Delete a project" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "project.delete()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Clean up" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "!rm " + ], + "outputs": [], + "metadata": {} + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.16 64-bit ('3.8.16': pyenv)" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "interpreter": { + "hash": "b5803137338cb19a16337a87a205be3b478f8cca74095ec6d83bca1ed0847cec" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From 47d72debe861e3988f4c3025fcb406e44594a764 Mon Sep 17 00:00:00 2001 From: Rupeekshan Maheswaran Date: Mon, 20 Mar 2023 20:13:06 +0530 Subject: [PATCH 2/6] add FS client --- h2o_ai_cloud.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/h2o_ai_cloud.py b/h2o_ai_cloud.py index cae4c84..a0ab257 100644 --- a/h2o_ai_cloud.py +++ b/h2o_ai_cloud.py @@ -3,6 +3,7 @@ import h2o_authn import h2o_mlops_client import h2osteam +import featurestore # The URL you use to access the H2O AI Cloud's UI - do not include the `https://` - ex: cloud.h2o.ai @@ -44,7 +45,6 @@ def mlops_client(): ) - def steam_client(): """ Connect to Enterprise Steam, Driverless AI, and H2O-3 @@ -58,3 +58,15 @@ def steam_client(): ) +def fs_client(token_provider): + """ + Connect to Feature Store + """ + FS_API = "https://featurestore." 
+ H2O_CLOUD_URL + + client = featurestore.Client( + url=FS_API, + secure=True + ) + return client.auth.set_obtain_access_token_method(token_provider()) + From fba1e0531c95e5d8f71c5beabb0b54e74843c621 Mon Sep 17 00:00:00 2001 From: Rupeekshan Maheswaran Date: Mon, 20 Mar 2023 20:16:51 +0530 Subject: [PATCH 3/6] include FS details in the readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9ae40ca..a881215 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ pip install h2o_authn==0.1.1 pip install https://enterprise-steam.s3.amazonaws.com/release/1.8.12/python/h2osteam-1.8.12-py2.py3-none-any.whl pip install https://s3.amazonaws.com/artifacts.h2o.ai/releases/ai/h2o/mlops/rel-0.56.1/2/h2o_mlops_client-0.56.1%2Bdd66f93.rel0.56.1.2-py2.py3-none-any.whl pip install https://h2o-release.s3.amazonaws.com/h2o/rel-zumbo/2/Python/h2o-3.36.1.2-py2.py3-none-any.whl +pip install pyspark==3.2.1 h2o-featurestore==0.14.4 ``` ### Setup your connection @@ -37,4 +38,5 @@ Update the `h2o_ai_cloud.py` file with the connection parameters for your H2O AI * H2O-3 additional tutorials: https://github.com/h2oai/h2o-tutorials * MLOps product documentation: https://docs.h2o.ai/mlops/ * MLOps python documentation: https://docs.h2o.ai/mlops/py-client-installing/ +* Feature Store product documentation: https://docs.h2o.ai/feature-store/latest-stable/docs/index.html From 5e8819042beb14ee689fe741b45411939174dddd Mon Sep 17 00:00:00 2001 From: Rupeekshan Maheswaran Date: Mon, 17 Apr 2023 23:37:15 +0530 Subject: [PATCH 4/6] add basic functions of Feature Store as notebook --- .DS_Store | Bin 0 -> 6148 bytes 6 Feature Store Workflow.ipynb | 631 ++++++++++++++++++++++++++ 6 Feature Store: Basic Workflow.ipynb | 377 --------------- 3 files changed, 631 insertions(+), 377 deletions(-) create mode 100644 .DS_Store create mode 100644 6 Feature Store Workflow.ipynb delete mode 100644 6 Feature Store: Basic Workflow.ipynb diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..356f284aca10ef18b6de7879d90ebf5e99aedbbf GIT binary patch literal 6148 zcmeHKJ#Q015Pb_SLMU)a6SShok|9ZfD9+{%5h;-cE>b43eSj6mhx`FbcRl}sA3($J zOXa+o9VCCmNfV(Y+L7ktc5mL?zTLaM1>l;~ULB|c*r1B9Y|>mI(l5FqkE}&Hono}$ z(ZC+<`<1(6Lu4Q_aMu`+ch{p%{fO6?e}4xkFhvh<+4qT2A6=Ycj0sLKqOGyzJI*mg zXKzq+JAGJ=)C4nR}*)2aJZ)uS6Nr{`+oZdL?Z+Ypt*Gcd5n} z-Vh_@llAd~xg!h|VQk1UiUvn9;iwLU%tK-E zp+=|pqE&vbX9e$w{U^-nY{vK*zHwFy91*)u+f@WvFEv>eODjTV>-XG>c!JN!I3Fn) z%b0P7-eQK%!h8k+qTDhAF5qoEy=B}87a52Q+#4_75xK7nUQ=jnJJvw9 d;^$Nq_IU~D6Z2*8Y@x&-0i_|D$iRPP;1V<&rz-#e literal 0 HcmV?d00001 diff --git a/6 Feature Store Workflow.ipynb b/6 Feature Store Workflow.ipynb new file mode 100644 index 0000000..c1882f7 --- /dev/null +++ b/6 Feature Store Workflow.ipynb @@ -0,0 +1,631 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Feature Store - A unified storage of curated features\n", + "\n", + "This notebook is intended to help you get started with Feature Store in the H2O AI Cloud using python.\n", + "\n", + "* **Product Documentation:** https://h2oai.github.io/featurestore/" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "from featurestore import CSVFile, Schema\n", + "from pyspark.sql import SparkSession\n", + "from h2o_ai_cloud import fs_client" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Configure User Spark session for Feature Store" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + 
"source": [ + "### Set up Java Environment for Spark " + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "import os\n", + "from jdk4py import JAVA_HOME\n", + "os.environ['JAVA_HOME'] = str(JAVA_HOME)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "spark_dependencies_jar = \"https://s3.amazonaws.com/artifacts.h2o.ai/releases/ai/h2o/feature-store/release/0.15.0/spark-dependencies/featurestore-azure-gen2-spark-dependencies-0.15.0.jar\"\n", + "spark = SparkSession.builder \\\n", + " .master(\"local\") \\\n", + " .config(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:3.3.1,org.apache.hadoop:hadoop-azure:3.3.1,io.delta:delta-core_2.12:2.2.0\") \\\n", + " .config(\"spark.jars\", spark_dependencies_jar) \\\n", + " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\") \\\n", + " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\") \\\n", + " .getOrCreate()\n", + "spark.sparkContext.setLogLevel(\"ERROR\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + ":: loading settings :: url = jar:file:/opt/conda/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Ivy Default Cache set to: /home/jovyan/.ivy2/cache\n", + "The jars for the packages stored in: /home/jovyan/.ivy2/jars\n", + "org.apache.hadoop#hadoop-aws added as a dependency\n", + "org.apache.hadoop#hadoop-azure added as a dependency\n", + "io.delta#delta-core_2.12 added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-67b05020-a1d9-41be-9fae-9f8b614c61c3;1.0\n", + "\tconfs: [default]\n", + "\tfound org.apache.hadoop#hadoop-aws;3.3.1 in central\n", + "\tfound com.amazonaws#aws-java-sdk-bundle;1.11.901 in central\n", + "\tfound org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central\n", + "\tfound org.apache.hadoop#hadoop-azure;3.3.1 in central\n", + "\tfound org.apache.httpcomponents#httpclient;4.5.13 in central\n", + "\tfound org.apache.httpcomponents#httpcore;4.4.13 in central\n", + "\tfound commons-logging#commons-logging;1.1.3 in central\n", + "\tfound commons-codec#commons-codec;1.11 in central\n", + "\tfound com.microsoft.azure#azure-storage;7.0.1 in central\n", + "\tfound com.fasterxml.jackson.core#jackson-core;2.10.5 in central\n", + "\tfound org.slf4j#slf4j-api;1.7.30 in central\n", + "\tfound com.microsoft.azure#azure-keyvault-core;1.0.0 in central\n", + "\tfound com.google.guava#guava;27.0-jre in central\n", + "\tfound com.google.guava#failureaccess;1.0 in central\n", + "\tfound com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central\n", + "\tfound com.google.code.findbugs#jsr305;3.0.2 in central\n", + "\tfound org.checkerframework#checker-qual;2.5.2 in central\n", + "\tfound com.google.errorprone#error_prone_annotations;2.2.0 in central\n", + "\tfound com.google.j2objc#j2objc-annotations;1.1 in central\n", + "\tfound org.codehaus.mojo#animal-sniffer-annotations;1.17 in central\n", + "\tfound org.apache.hadoop.thirdparty#hadoop-shaded-guava;1.1.1 in central\n", + "\tfound org.eclipse.jetty#jetty-util-ajax;9.4.40.v20210413 in central\n", + "\tfound org.eclipse.jetty#jetty-util;9.4.40.v20210413 in central\n", + "\tfound org.codehaus.jackson#jackson-mapper-asl;1.9.13 in central\n", + "\tfound 
org.codehaus.jackson#jackson-core-asl;1.9.13 in central\n", + "\tfound io.delta#delta-core_2.12;2.2.0 in central\n", + "\tfound io.delta#delta-storage;2.2.0 in central\n", + "\tfound org.antlr#antlr4-runtime;4.8 in central\n", + ":: resolution report :: resolve 423ms :: artifacts dl 16ms\n", + "\t:: modules in use:\n", + "\tcom.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]\n", + "\tcom.fasterxml.jackson.core#jackson-core;2.10.5 from central in [default]\n", + "\tcom.google.code.findbugs#jsr305;3.0.2 from central in [default]\n", + "\tcom.google.errorprone#error_prone_annotations;2.2.0 from central in [default]\n", + "\tcom.google.guava#failureaccess;1.0 from central in [default]\n", + "\tcom.google.guava#guava;27.0-jre from central in [default]\n", + "\tcom.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default]\n", + "\tcom.google.j2objc#j2objc-annotations;1.1 from central in [default]\n", + "\tcom.microsoft.azure#azure-keyvault-core;1.0.0 from central in [default]\n", + "\tcom.microsoft.azure#azure-storage;7.0.1 from central in [default]\n", + "\tcommons-codec#commons-codec;1.11 from central in [default]\n", + "\tcommons-logging#commons-logging;1.1.3 from central in [default]\n", + "\tio.delta#delta-core_2.12;2.2.0 from central in [default]\n", + "\tio.delta#delta-storage;2.2.0 from central in [default]\n", + "\torg.antlr#antlr4-runtime;4.8 from central in [default]\n", + "\torg.apache.hadoop#hadoop-aws;3.3.1 from central in [default]\n", + "\torg.apache.hadoop#hadoop-azure;3.3.1 from central in [default]\n", + "\torg.apache.hadoop.thirdparty#hadoop-shaded-guava;1.1.1 from central in [default]\n", + "\torg.apache.httpcomponents#httpclient;4.5.13 from central in [default]\n", + "\torg.apache.httpcomponents#httpcore;4.4.13 from central in [default]\n", + "\torg.checkerframework#checker-qual;2.5.2 from central in [default]\n", + "\torg.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]\n", + "\torg.codehaus.jackson#jackson-mapper-asl;1.9.13 from central in [default]\n", + "\torg.codehaus.mojo#animal-sniffer-annotations;1.17 from central in [default]\n", + "\torg.eclipse.jetty#jetty-util;9.4.40.v20210413 from central in [default]\n", + "\torg.eclipse.jetty#jetty-util-ajax;9.4.40.v20210413 from central in [default]\n", + "\torg.slf4j#slf4j-api;1.7.30 from central in [default]\n", + "\torg.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 28 | 0 | 0 | 0 || 28 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-67b05020-a1d9-41be-9fae-9f8b614c61c3\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 28 already retrieved (0kB/10ms)\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "23/04/17 17:15:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Connect to Feature Store\n", + "We first connect to the Feature Store cloud endpoint using appropriate H2O Cloud discovery service inorder to initialize client. Then we can log into Feature Store." + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "client = fs_client()\n", + "client.auth.login()" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "17-04-2023 05:16:00 : INFO : client : Connecting to the server featurestore-api.cloud-qa.h2o.ai ...\n", + "17-04-2023 05:16:02 : ERROR : auth : Browser is not supported: Please visit https://auth.demo.h2o.ai/auth/realms/q8s-qa/protocol/openid-connect/auth?client_id=feature-store-qa&code_challenge=-IFutm4_E4ZeZiDe_Iqf35D1BPBCYcKGyKJGuxPWtwM&code_challenge_method=S256&redirect_uri=https://featurestore.cloud-qa.h2o.ai/Callback&response_type=code&scope=openid%20offline_access&state=gK8R62SM7l to continue authentication.\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Understand the environment" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "client.get_version()" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'0.15.0'" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Define data source \n", + "Feature Store supports different data sources - https://h2oai.github.io/featurestore/supported_data_sources" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "source = CSVFile(\"s3a://h2o-public-test-data/smalldata/gbm_test/titanic.csv\")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Extract schema from data source\n", + "The schema represents the features of the feature set" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "schema = client.extract_schema_from_source(source)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "17-04-2023 05:16:52 : INFO : interactive_console : Job ID: 01gy83ewsh7zhcmg8zcjewmh7e, Status: Finished setting up spark session.\n", + "17-04-2023 05:17:06 : INFO : interactive_console : Job ID: 01gy83ewsh7zhcmg8zcjewmh7e, Status: Finished reading data from source location to extract schema.\n", + "17-04-2023 05:17:06 : INFO : interactive_console : Job ID: 01gy83ewsh7zhcmg8zcjewmh7e, Status: Schema generation completed.\n", + "17-04-2023 05:17:06 : INFO : interactive_console : \n", + "\n", + "Time taken - 60.696 seconds\n" + ] + } + ], + "metadata": { + "tags": [] + } + }, + { + "cell_type": "markdown", + "source": [ + "## Create a project\n", + "User can follow naming conventions mentioned in here - https://h2oai.github.io/featurestore/api/naming_conventions" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 12, + "source": [ + "project = client.projects.create(\"sample_project\")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Create a feature set" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 13, + "source": [ + "feature_set = project.feature_sets.register(schema, \"sample_fs\")" + ], + "outputs": [], + "metadata": { + "tags": [] + } + }, + { + 
"cell_type": "markdown", + "source": [ + "## Ingest data from source\n", + "Uploading data into Feature Store" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 14, + "source": [ + "feature_set.ingest(source)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "17-04-2023 05:18:14 : INFO : interactive_console : Job ID: 01gy83j2wbpdkg9cfsqt77v5pq, Status: Finished setting up spark session.\n", + "17-04-2023 05:18:14 : INFO : interactive_console : Job ID: 01gy83j2wbpdkg9cfsqt77v5pq, Status: Finished reading data to ingest.\n", + "17-04-2023 05:18:22 : INFO : interactive_console : Job ID: 01gy83j2wbpdkg9cfsqt77v5pq, Status: Finished extracting scope from the data.\n", + "17-04-2023 05:18:42 : INFO : interactive_console : Job ID: 01gy83j2wbpdkg9cfsqt77v5pq, Status: Finished computation of incremental statistics.\n", + "17-04-2023 05:19:57 : INFO : interactive_console : Job ID: 01gy83j2wbpdkg9cfsqt77v5pq, Status: Finished writing data to main storage.\n", + "17-04-2023 05:20:01 : INFO : interactive_console : \n", + "\n", + "Time taken - 131.536 seconds\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{\n", + " \"rawCacheLocation\": \"01879039-0b43-d3e8-f649-3007b595fde7/01879039-0b5c-68d9-de71-de543814f45b-f6b4c38b-f891-4346-acef-4bd76e0d0476-raw\",\n", + " \"ingestionTimestamp\": \"2023-04-17T17:18:13.096298028Z\",\n", + " \"ingestScope\": {\n", + " \"startDateTime\": \"2023-04-17T17:18:13.096298028Z\",\n", + " \"endDateTime\": \"2023-04-17T17:18:13.096298028Z\"\n", + " },\n", + " \"ingestId\": \"01gy83js39km6f7ke8tjqgy8d0\",\n", + " \"cacheLocation\": \"\",\n", + " \"message\": \"\"\n", + "}" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Retrieve the data" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 15, + "source": [ + "reference = feature_set.retrieve()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Download features\n", + "Download the files from Feature Store" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 16, + "source": [ + "reference.download()" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "17-04-2023 05:20:28 : INFO : interactive_console : Job ID: 01gy83p3bk6x9qgegky2n3pqbe, Status: Finished setting up spark session.\n", + "17-04-2023 05:21:06 : INFO : interactive_console : Job ID: 01gy83p3bk6x9qgegky2n3pqbe, Status: Finished reading data from main storage.\n", + "17-04-2023 05:21:34 : INFO : interactive_console : Job ID: 01gy83p3bk6x9qgegky2n3pqbe, Status: Finished writing data to retrieve storage.\n", + "17-04-2023 05:21:34 : INFO : interactive_console : Job ID: 01gy83p3bk6x9qgegky2n3pqbe, Status: Finished generating pre-signed urls.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/tmp/tmpbbu36pmt'" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ], + "metadata": { + "tags": [] + } + }, + { + "cell_type": "markdown", + "source": [ + "## Obtain data as a Spark Frame \n", + "Download features as spark dataframe" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 17, + "source": [ + "reference.as_spark_frame(spark).show()" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[Stage 8:> (0 + 1) / 1]\r" + ] + }, + { + "output_type": "stream", + "name": "stdout", + 
"text": [ + "+------+--------+--------------------+------+----+-----+-----+----------------+--------+-----------+--------+----+----+--------------------+---------------------------------+\n", + "|pclass|survived| name| sex| age|sibsp|parch| ticket| fare| cabin|embarked|boat|body| home.dest|time_travel_column_auto_generated|\n", + "+------+--------+--------------------+------+----+-----+-----+----------------+--------+-----------+--------+----+----+--------------------+---------------------------------+\n", + "| 1| 1|Cardeza Mr. Thom...| male|36.0| 0| 1| PC 17755|512.3292|B51 B53 B55| C| 3|null|Austria-Hungary /...| 2023-04-17 17:18:13|\n", + "| 2| 0|Hickman Mr. Stan...| male|21.0| 2| 0| S.O.C. 14879| 73.5| null| S|null|null|West Hampstead L...| 2023-04-17 17:18:13|\n", + "| 2| 0| Hold Mr. Stephen| male|44.0| 1| 0| 26707| 26.0| null| S|null|null|England / Sacrame...| 2023-04-17 17:18:13|\n", + "| 2| 0|Parkes Mr. Franc...| male|null| 0| 0| 239853| 0.0| null| S|null|null| Belfast| 2023-04-17 17:18:13|\n", + "| 2| 1|Sinkkonen Miss. ...|female|30.0| 0| 0| 250648| 13.0| null| S| 10|null|Finland / Washing...| 2023-04-17 17:18:13|\n", + "| 3| 1|Abrahamsson Mr. ...| male|20.0| 0| 0|SOTON/O2 3101284| 7.925| null| S| 15|null|Taalintehdas Fin...| 2023-04-17 17:18:13|\n", + "| 3| 0| Barry Miss. Julia|female|27.0| 0| 0| 330844| 7.8792| null| Q|null|null| New York NY| 2023-04-17 17:18:13|\n", + "| 3| 0| Lockyer Mr. Edward| male|null| 0| 0| 1222| 7.8792| null| S|null| 153| null| 2023-04-17 17:18:13|\n", + "| 3| 1|Nilsson Miss. He...|female|26.0| 0| 0| 347470| 7.8542| null| S| 13|null| null| 2023-04-17 17:18:13|\n", + "| 3| 0|Robins Mrs. Alex...|female|47.0| 1| 0| A/5. 3337| 14.5| null| S|null| 7| null| 2023-04-17 17:18:13|\n", + "| 3| 0|Skoog Miss. Marg...|female| 2.0| 3| 2| 347088| 27.9| null| S|null|null| null| 2023-04-17 17:18:13|\n", + "| 1| 1|Chambers Mrs. No...|female|33.0| 1| 0| 113806| 53.1| E8| S| 5|null|New York NY / It...| 2023-04-17 17:18:13|\n", + "| 1| 0|Douglas Mr. Walt...| male|50.0| 1| 0| PC 17761| 106.425| C86| C|null| 62|Deephaven MN / C...| 2023-04-17 17:18:13|\n", + "| 1| 1|Duff Gordon Sir....| male|49.0| 1| 0| PC 17485| 56.9292| A20| C| 1|null| London / Paris| 2023-04-17 17:18:13|\n", + "| 1| 0|Hilliard Mr. Her...| male|null| 0| 0| 17463| 51.8625| E46| S|null|null| Brighton MA| 2023-04-17 17:18:13|\n", + "| 1| 1|Longley Miss. Gr...|female|21.0| 0| 0| 13502| 77.9583| D9| S| 10|null| Hudson NY| 2023-04-17 17:18:13|\n", + "| 2| 0|Corey Mrs. Percy...|female|null| 0| 0| F.C.C. 13534| 21.0| null| S|null|null|Upper Burma Indi...| 2023-04-17 17:18:13|\n", + "| 2| 0|Sobey Mr. Samuel...| male|25.0| 0| 0| C.A. 29178| 13.0| null| S|null|null|Cornwall / Hought...| 2023-04-17 17:18:13|\n", + "| 3| 0|Ibrahim Shawah M...| male|30.0| 0| 0| 2685| 7.2292| null| C|null|null| null| 2023-04-17 17:18:13|\n", + "| 3| 0| Kink Mr. 
Vincenz| male|26.0| 2| 0| 315151| 8.6625| null| S|null|null| null| 2023-04-17 17:18:13|\n", + "+------+--------+--------------------+------+----+-----+-----+----------------+--------+-----------+--------+----+----+--------------------+---------------------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + " \r" + ] + } + ], + "metadata": { + "tags": [] + } + }, + { + "cell_type": "markdown", + "source": [ + "### Prepare a schema from a string\n", + "Schema can be created from a string format" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 18, + "source": [ + "schema_str = \"id integer, value string\"\n", + "schema = Schema.create_from(schema_str)" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Create another feature set" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 19, + "source": [ + "fs_online = project.feature_sets.register(schema, \"sample_fs_online\", primary_key=\"id\")" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Ingest data from Online Feature Store" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 20, + "source": [ + "fs_online.ingest_online('{\"id\": 1, \"value\": \"test\"}')" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Retrieve data from Online Feature Store" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 22, + "source": [ + "fs_online.retrieve_online(1)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'id': 1, 'value': 'test'}" + ] + }, + "metadata": {}, + "execution_count": 22 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Delete a feature set" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 23, + "source": [ + "fs_online.delete()" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "17-04-2023 05:22:06 : INFO : feature_set : Feature set 'sample_fs_online' is deleted\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## Delete a project" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 24, + "source": [ + "project.delete()" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "17-04-2023 05:22:06 : INFO : project : Project 'sample_project' is deleted\n" + ] + } + ], + "metadata": {} + } + ], + "metadata": { + "interpreter": { + "hash": "b5803137338cb19a16337a87a205be3b478f8cca74095ec6d83bca1ed0847cec" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.16 64-bit ('3.8.16': pyenv)" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/6 Feature Store: Basic Workflow.ipynb b/6 Feature Store: Basic Workflow.ipynb deleted file mode 100644 index 419dd2b..0000000 --- a/6 Feature Store: Basic Workflow.ipynb +++ /dev/null @@ -1,377 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Feature Store - A unified storage of curated features\n", - "\n", - "This notebook is intended to help you 
get started with Feature Store in the H2O AI Cloud using python.\n", - "\n", - "* **Product Documentation:** https://docs.h2o.ai/feature-store/latest-stable/docs/index.html" - ], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Prerequistes\n", - "\n", - "Update the `h2o_ai_cloud.py` file with the connection parameters for your H2O AI Cloud environemnt:\n", - "1. Login to your H2O AI Cloud environment\n", - "1. Click your username or avatar in the H2O AI Cloud navigation bar\n", - "1. Navigate to `CLI & API Access`\n", - "1. Use the variables from the `Accessing H2O AI Cloud APIs` section to populate the parameters" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 3, - "source": [ - "from getpass import getpass\n", - "\n", - "from h2o_ai_cloud import token_provider, fs_client\n", - "from featurestore import CSVFile, Schema\n", - "\n", - "from pyspark.sql import SparkSession" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Securely connect to the platform\n", - "We first connect to the H2O AI Cloud using our platform token to create a token provider object. We can then use this object to log into Feature Store." - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 5, - "source": [ - "client = fs_client(token_provider())" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Understand the environment" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "client.get_version()" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Configure spark for Feature Store " - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "source": [ - "spark = SparkSession.builder \\\n", - " .master(\"local\") \\\n", - " .config(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.12.238,io.delta:delta-core_2.12:1.2.1\") \\\n", - " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\") \\\n", - " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\") \\\n", - " .getOrCreate()\n", - "\n", - "spark.sparkContext.setLogLevel(\"ERROR\")" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Define data source \n", - "Feature Store supports different data sources - https://docs.h2o.ai/feature-store/latest-stable/docs/supported_data_sources.html" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 27, - "source": [ - "source = CSVFile(\"s3a://h2o-public-test-data/smalldata/gbm_test/titanic.csv\")" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Extract schema from data source\n", - "The schema represents the features of the feature set" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 4, - "source": [ - "schema = client.extract_schema_from_source(source)" - ], - "outputs": [], - "metadata": { - "tags": [] - } - }, - { - "cell_type": "markdown", - "source": [ - "## Create a project\n", - "User can follow naming conventions mentioned in here - https://docs.h2o.ai/feature-store/latest-stable/docs/api/naming_conventions.html" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 29, - "source": [ - "project = client.projects.create(\"sample_project\")" - ], - "outputs": [], - "metadata": 
{} - }, - { - "cell_type": "markdown", - "source": [ - "## Create a feature set" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 30, - "source": [ - "feature_set = project.feature_sets.register(schema, \"sample_fs\")" - ], - "outputs": [], - "metadata": { - "tags": [] - } - }, - { - "cell_type": "markdown", - "source": [ - "## Ingest data from source\n", - "Uploading data into Feature Store" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 5, - "source": [ - "feature_set.ingest(source)" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Retrieve the data" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 32, - "source": [ - "reference = feature_set.retrieve()" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Download features\n", - "Download the files from Feature Store" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 6, - "source": [ - "reference.download()" - ], - "outputs": [], - "metadata": { - "tags": [] - } - }, - { - "cell_type": "markdown", - "source": [ - "## Obtain data as a Spark Frame \n", - "Download features as spark dataframe" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 7, - "source": [ - "reference.as_spark_frame(spark).show()" - ], - "outputs": [], - "metadata": { - "tags": [] - } - }, - { - "cell_type": "markdown", - "source": [ - "### Prepare a schema from a string\n", - "Schema can be created from a string format" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 35, - "source": [ - "schema_str = \"id integer, value string\"\n", - "schema = Schema.create_from(schema_str)" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "### Create another feature set" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 36, - "source": [ - "fs_online = project.feature_sets.register(schema, \"sample_fs_online\", primary_key=\"id\")" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Ingest data from Online Feature Store" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 37, - "source": [ - "fs_online.ingest_online('{\"id\": 1, \"value\": \"test\"}')" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Retrieve data from Online Feature Store" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 10, - "source": [ - "fs_online.retrieve_online(1)" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Delete a feature set" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 8, - "source": [ - "fs_online.delete()" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Delete a project" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 9, - "source": [ - "project.delete()" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "## Clean up" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 2, - "source": [ - "!rm " - ], - "outputs": [], - "metadata": {} - } - ], - "metadata": { - "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.16 64-bit ('3.8.16': pyenv)" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - 
"file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.16" - }, - "interpreter": { - "hash": "b5803137338cb19a16337a87a205be3b478f8cca74095ec6d83bca1ed0847cec" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file From e8f8c10eeff21facc050ea16a1585c61898008b7 Mon Sep 17 00:00:00 2001 From: Rupeekshan Maheswaran Date: Mon, 17 Apr 2023 23:37:32 +0530 Subject: [PATCH 5/6] fs client initialization --- h2o_ai_cloud.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/h2o_ai_cloud.py b/h2o_ai_cloud.py index a0ab257..a768edc 100644 --- a/h2o_ai_cloud.py +++ b/h2o_ai_cloud.py @@ -1,8 +1,10 @@ +import os import getpass import h2o_authn import h2o_mlops_client import h2osteam +import h2o_discovery import featurestore @@ -58,15 +60,14 @@ def steam_client(): ) -def fs_client(token_provider): +def fs_client(): """ Connect to Feature Store """ - FS_API = "https://featurestore." + H2O_CLOUD_URL - + discovery = h2o_discovery.discover(os.environ['H2O_CLOUD_ENVIRONMENT']) client = featurestore.Client( - url=FS_API, + url=discovery.services['feature-store-grpc-api'].uri, secure=True ) - return client.auth.set_obtain_access_token_method(token_provider()) + return client From e09f1f908c837bc5a861c3bf1babed8842c4ed42 Mon Sep 17 00:00:00 2001 From: Rupeekshan Maheswaran Date: Fri, 28 Apr 2023 20:44:19 +0530 Subject: [PATCH 6/6] improvise wordings --- 6 Feature Store Workflow.ipynb | 312 +++++++++++++++++---------------- 1 file changed, 159 insertions(+), 153 deletions(-) diff --git a/6 Feature Store Workflow.ipynb b/6 Feature Store Workflow.ipynb index c1882f7..d62be68 100644 --- a/6 Feature Store Workflow.ipynb +++ b/6 Feature Store Workflow.ipynb @@ -1,77 +1,69 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", + "metadata": {}, "source": [ - "# Feature Store - A unified storage of curated features\n", + "# Feature Store - Unified storage of curated features\n", "\n", - "This notebook is intended to help you get started with Feature Store in the H2O AI Cloud using python.\n", + "This notebook is intended to help you get started with Feature Store in the H2O AI Cloud using Python.\n", "\n", "* **Product Documentation:** https://h2oai.github.io/featurestore/" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "from featurestore import CSVFile, Schema\n", "from pyspark.sql import SparkSession\n", "from h2o_ai_cloud import fs_client" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Configure User Spark session for Feature Store" - ], - "metadata": {} + ] }, { + "attachments": {}, "cell_type": "markdown", + "metadata": {}, "source": [ - "### Set up Java Environment for Spark " - ], - "metadata": {} + "### Set up the Java Environment for Spark " + ] }, { "cell_type": "code", "execution_count": 4, + "metadata": {}, + "outputs": [], "source": [ "import os\n", "from jdk4py import JAVA_HOME\n", "os.environ['JAVA_HOME'] = str(JAVA_HOME)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 5, - "source": [ - "spark_dependencies_jar = \"https://s3.amazonaws.com/artifacts.h2o.ai/releases/ai/h2o/feature-store/release/0.15.0/spark-dependencies/featurestore-azure-gen2-spark-dependencies-0.15.0.jar\"\n", - "spark = SparkSession.builder \\\n", - " .master(\"local\") \\\n", - " 
.config(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:3.3.1,org.apache.hadoop:hadoop-azure:3.3.1,io.delta:delta-core_2.12:2.2.0\") \\\n", - " .config(\"spark.jars\", spark_dependencies_jar) \\\n", - " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\") \\\n", - " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\") \\\n", - " .getOrCreate()\n", - "spark.sparkContext.setLogLevel(\"ERROR\")" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ ":: loading settings :: url = jar:file:/opt/conda/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" ] }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "Ivy Default Cache set to: /home/jovyan/.ivy2/cache\n", "The jars for the packages stored in: /home/jovyan/.ivy2/jars\n", @@ -150,112 +142,124 @@ ] }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "23/04/17 17:15:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" ] }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" ] } ], - "metadata": {} + "source": [ + "spark_dependencies_jar = \"https://s3.amazonaws.com/artifacts.h2o.ai/releases/ai/h2o/feature-store/release/0.15.0/spark-dependencies/featurestore-azure-gen2-spark-dependencies-0.15.0.jar\"\n", + "spark = SparkSession.builder \\\n", + " .master(\"local\") \\\n", + " .config(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:3.3.1,org.apache.hadoop:hadoop-azure:3.3.1,io.delta:delta-core_2.12:2.2.0\") \\\n", + " .config(\"spark.jars\", spark_dependencies_jar) \\\n", + " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\") \\\n", + " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\") \\\n", + " .getOrCreate()\n", + "spark.sparkContext.setLogLevel(\"ERROR\")" + ] }, { + "attachments": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "## Connect to Feature Store\n", - "We first connect to the Feature Store cloud endpoint using appropriate H2O Cloud discovery service inorder to initialize client. Then we can log into Feature Store." - ], - "metadata": {} + "We first connect to the Feature Store cloud endpoint using appropriate H2O Cloud Discovery Service in order to initialize the client. Then we are able to authenticate into Feature Store." 
+ ] }, { "cell_type": "code", "execution_count": 6, - "source": [ - "client = fs_client()\n", - "client.auth.login()" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "17-04-2023 05:16:00 : INFO : client : Connecting to the server featurestore-api.cloud-qa.h2o.ai ...\n", "17-04-2023 05:16:02 : ERROR : auth : Browser is not supported: Please visit https://auth.demo.h2o.ai/auth/realms/q8s-qa/protocol/openid-connect/auth?client_id=feature-store-qa&code_challenge=-IFutm4_E4ZeZiDe_Iqf35D1BPBCYcKGyKJGuxPWtwM&code_challenge_method=S256&redirect_uri=https://featurestore.cloud-qa.h2o.ai/Callback&response_type=code&scope=openid%20offline_access&state=gK8R62SM7l to continue authentication.\n" ] } ], - "metadata": {} + "source": [ + "client = fs_client()\n", + "client.auth.login()" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Understand the environment" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 7, - "source": [ - "client.get_version()" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "'0.15.0'" ] }, + "execution_count": 7, "metadata": {}, - "execution_count": 7 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "client.get_version()" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Define data source \n", "Feature Store supports different data sources - https://h2oai.github.io/featurestore/supported_data_sources" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 8, + "metadata": {}, + "outputs": [], "source": [ "source = CSVFile(\"s3a://h2o-public-test-data/smalldata/gbm_test/titanic.csv\")" - ], - "outputs": [], - "metadata": {} + ] }, { + "attachments": {}, "cell_type": "markdown", + "metadata": {}, "source": [ - "## Extract schema from data source\n", + "## Extract the schema from the data source\n", "The schema represents the features of the feature set" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 9, - "source": [ - "schema = client.extract_schema_from_source(source)" - ], + "metadata": { + "tags": [] + }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "17-04-2023 05:16:52 : INFO : interactive_console : Job ID: 01gy83ewsh7zhcmg8zcjewmh7e, Status: Finished setting up spark session.\n", "17-04-2023 05:17:06 : INFO : interactive_console : Job ID: 01gy83ewsh7zhcmg8zcjewmh7e, Status: Finished reading data from source location to extract schema.\n", @@ -266,63 +270,62 @@ ] } ], - "metadata": { - "tags": [] - } + "source": [ + "schema = client.extract_schema_from_source(source)" + ] }, { + "attachments": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "## Create a project\n", - "User can follow naming conventions mentioned in here - https://h2oai.github.io/featurestore/api/naming_conventions" - ], - "metadata": {} + "Users must follow the project naming conventions outlined on https://h2oai.github.io/featurestore/api/naming_conventions" + ] }, { "cell_type": "code", "execution_count": 12, + "metadata": {}, + "outputs": [], "source": [ "project = client.projects.create(\"sample_project\")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Create a feature set" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 13, - "source": [ - "feature_set = project.feature_sets.register(schema, \"sample_fs\")" - ], - 
"outputs": [], "metadata": { "tags": [] - } + }, + "outputs": [], + "source": [ + "feature_set = project.feature_sets.register(schema, \"sample_fs\")" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Ingest data from source\n", "Uploading data into Feature Store" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 14, - "source": [ - "feature_set.ingest(source)" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "17-04-2023 05:18:14 : INFO : interactive_console : Job ID: 01gy83j2wbpdkg9cfsqt77v5pq, Status: Finished setting up spark session.\n", "17-04-2023 05:18:14 : INFO : interactive_console : Job ID: 01gy83j2wbpdkg9cfsqt77v5pq, Status: Finished reading data to ingest.\n", @@ -335,7 +338,6 @@ ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "{\n", @@ -351,46 +353,49 @@ "}" ] }, + "execution_count": 14, "metadata": {}, - "execution_count": 14 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "feature_set.ingest(source)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Retrieve the data" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 15, + "metadata": {}, + "outputs": [], "source": [ "reference = feature_set.retrieve()" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Download features\n", "Download the files from Feature Store" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 16, - "source": [ - "reference.download()" - ], + "metadata": { + "tags": [] + }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "17-04-2023 05:20:28 : INFO : interactive_console : Job ID: 01gy83p3bk6x9qgegky2n3pqbe, Status: Finished setting up spark session.\n", "17-04-2023 05:21:06 : INFO : interactive_console : Job ID: 01gy83p3bk6x9qgegky2n3pqbe, Status: Finished reading data from main storage.\n", @@ -399,45 +404,46 @@ ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "'/tmp/tmpbbu36pmt'" ] }, + "execution_count": 16, "metadata": {}, - "execution_count": 16 + "output_type": "execute_result" } ], - "metadata": { - "tags": [] - } + "source": [ + "reference.download()" + ] }, { + "attachments": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "## Obtain data as a Spark Frame \n", - "Download features as spark dataframe" - ], - "metadata": {} + "Download features as a Spark dataframe" + ] }, { "cell_type": "code", "execution_count": 17, - "source": [ - "reference.as_spark_frame(spark).show()" - ], + "metadata": { + "tags": [] + }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "[Stage 8:> (0 + 1) / 1]\r" ] }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------+--------+--------------------+------+----+-----+-----+----------------+--------+-----------+--------+----+----+--------------------+---------------------------------+\n", "|pclass|survived| name| sex| age|sibsp|parch| ticket| fare| cabin|embarked|boat|body| home.dest|time_travel_column_auto_generated|\n", @@ -468,141 +474,141 @@ ] }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ " \r" ] } ], - "metadata": { - "tags": [] - } + "source": [ + "reference.as_spark_frame(spark).show()" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Prepare a schema from a string\n", "Schema 
can be created from a string format" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 18, + "metadata": {}, + "outputs": [], "source": [ "schema_str = \"id integer, value string\"\n", "schema = Schema.create_from(schema_str)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Create another feature set" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 19, + "metadata": {}, + "outputs": [], "source": [ "fs_online = project.feature_sets.register(schema, \"sample_fs_online\", primary_key=\"id\")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Ingest data from Online Feature Store" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 20, + "metadata": {}, + "outputs": [], "source": [ "fs_online.ingest_online('{\"id\": 1, \"value\": \"test\"}')" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Retrieve data from Online Feature Store" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 22, - "source": [ - "fs_online.retrieve_online(1)" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "{'id': 1, 'value': 'test'}" ] }, + "execution_count": 22, "metadata": {}, - "execution_count": 22 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "fs_online.retrieve_online(1)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Delete a feature set" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 23, - "source": [ - "fs_online.delete()" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "17-04-2023 05:22:06 : INFO : feature_set : Feature set 'sample_fs_online' is deleted\n" ] } ], - "metadata": {} + "source": [ + "fs_online.delete()" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Delete a project" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 24, - "source": [ - "project.delete()" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "17-04-2023 05:22:06 : INFO : project : Project 'sample_project' is deleted\n" ] } ], - "metadata": {} + "source": [ + "project.delete()" + ] } ], "metadata": { @@ -610,8 +616,8 @@ "hash": "b5803137338cb19a16337a87a205be3b478f8cca74095ec6d83bca1ed0847cec" }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.16 64-bit ('3.8.16': pyenv)" + "display_name": "Python 3.8.16 64-bit ('3.8.16': pyenv)", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -628,4 +634,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +}
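
Condensed, the end-to-end flow demonstrated in "6 Feature Store Workflow.ipynb" looks like the following minimal sketch. It assumes the final fs_client() helper in h2o_ai_cloud.py (which resolves the Feature Store endpoint through H2O Cloud Discovery using the H2O_CLOUD_ENVIRONMENT variable), an interactive login, and access to the public sample S3 bucket; the Spark-dependent step (reference.as_spark_frame(spark)) is left out because it needs the SparkSession configured in the notebook.

# Minimal sketch of the workflow from "6 Feature Store Workflow.ipynb" (assumptions noted above).
from featurestore import CSVFile, Schema
from h2o_ai_cloud import fs_client

client = fs_client()   # resolves the Feature Store endpoint via H2O Cloud Discovery
client.auth.login()    # in a headless session this logs a URL to finish authentication
print(client.get_version())

# Offline store: register a feature set from a CSV source and ingest it
source = CSVFile("s3a://h2o-public-test-data/smalldata/gbm_test/titanic.csv")
schema = client.extract_schema_from_source(source)
project = client.projects.create("sample_project")
feature_set = project.feature_sets.register(schema, "sample_fs")
feature_set.ingest(source)

# Retrieve the ingested data and download the files locally
reference = feature_set.retrieve()
print(reference.download())   # prints the local directory holding the downloaded files

# Online store: a keyed feature set written and read one record at a time
online_schema = Schema.create_from("id integer, value string")
fs_online = project.feature_sets.register(online_schema, "sample_fs_online", primary_key="id")
fs_online.ingest_online('{"id": 1, "value": "test"}')
print(fs_online.retrieve_online(1))   # {'id': 1, 'value': 'test'}

# Clean up the sample artifacts
fs_online.delete()
project.delete()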