From 676d135640831dcf1b7088c20a48eeb270e01279 Mon Sep 17 00:00:00 2001 From: Dhananjay Mukhedkar <55157590+dhananjay-mk@users.noreply.github.com> Date: Tue, 2 Jul 2024 12:05:13 +0200 Subject: [PATCH 01/11] undo conflicted overwritten method (#1353) --- python/hsfs/util.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/hsfs/util.py b/python/hsfs/util.py index 6d4e095e14..9a694573cc 100644 --- a/python/hsfs/util.py +++ b/python/hsfs/util.py @@ -20,6 +20,7 @@ import json import logging import re +import sys import threading import time from datetime import date, datetime, timezone @@ -542,6 +543,13 @@ def build_serving_keys_from_prepared_statements( return serving_keys +def is_runtime_notebook(): + if "ipykernel" in sys.modules: + return True + else: + return False + + class NpDatetimeEncoder(json.JSONEncoder): def default(self, obj): dtypes = (np.datetime64, np.complexfloating) From fec5fb4f16a9916cd9f8ee1c8ccd803f048e8363 Mon Sep 17 00:00:00 2001 From: Aleksey Veresov Date: Wed, 3 Jul 2024 17:39:32 +0200 Subject: [PATCH 02/11] [FSTORE-1459] Fix spark initialization in client/external.py (#1356) `enableHiveSupport` on itself is a method, and has to be called to get a builder from it. --- python/hsfs/client/external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsfs/client/external.py b/python/hsfs/client/external.py index bfc47dc790..e99fc20b4c 100644 --- a/python/hsfs/client/external.py +++ b/python/hsfs/client/external.py @@ -111,7 +111,7 @@ def __init__( # are needed when the application starts (before user code is run) # So in this case, we can't materialize the certificates on the fly. _logger.debug("Running in Spark environment, initializing Spark session") - _spark_session = SparkSession.builder.enableHiveSupport.getOrCreate() + _spark_session = SparkSession.builder.enableHiveSupport().getOrCreate() self._validate_spark_configuration(_spark_session) with open( From b939fff7c21caf38fdd01af3eac0b5e6e3011451 Mon Sep 17 00:00:00 2001 From: Gibson Chikafa Date: Wed, 3 Jul 2024 19:52:56 +0200 Subject: [PATCH 03/11] [HWORKS-1373] Append MATERIAL_DIRECTORY env var value to certificates path (#1358) * Append MATERIAL_DIRECTORY env var value to certificates path * Also include MATERIAL_DIR in token.jwt --- .../hsfs/metadata/HopsworksInternalClient.java | 16 ++++++++++++---- utils/python/hdfs-file-operations.py | 0 2 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 utils/python/hdfs-file-operations.py diff --git a/java/hsfs/src/main/java/com/logicalclocks/hsfs/metadata/HopsworksInternalClient.java b/java/hsfs/src/main/java/com/logicalclocks/hsfs/metadata/HopsworksInternalClient.java index 93baf42dd0..755938c171 100644 --- a/java/hsfs/src/main/java/com/logicalclocks/hsfs/metadata/HopsworksInternalClient.java +++ b/java/hsfs/src/main/java/com/logicalclocks/hsfs/metadata/HopsworksInternalClient.java @@ -46,6 +46,7 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.nio.file.Paths; +import java.nio.file.Files; import java.nio.file.StandardOpenOption; import java.security.KeyManagementException; import java.security.KeyStoreException; @@ -60,11 +61,15 @@ public class HopsworksInternalClient implements HopsworksHttpClient { private static final Logger LOGGER = LoggerFactory.getLogger(HopsworksInternalClient.class.getName()); private static final String DOMAIN_CA_TRUSTSTORE = "hopsworks.domain.truststore"; - private static final String TOKEN_PATH = "token.jwt"; + private static final String TOKEN_PATH = 
(Paths.get(System.getenv("MATERIAL_DIRECTORY"), + "token.jwt")).toString(); - private static final String MATERIAL_PASSWD = "material_passwd"; - private static final String T_CERTIFICATE = "t_certificate"; - private static final String K_CERTIFICATE = "k_certificate"; + private static final String MATERIAL_PASSWD = (Paths.get(System.getenv("MATERIAL_DIRECTORY"), + "material_passwd")).toString(); + private static final String T_CERTIFICATE = (Paths.get(System.getenv("MATERIAL_DIRECTORY"), + "t_certificate")).toString(); + private static final String K_CERTIFICATE = (Paths.get(System.getenv("MATERIAL_DIRECTORY"), + "k_certificate")).toString(); private PoolingHttpClientConnectionManager connectionPool = null; @@ -109,6 +114,9 @@ private Registry createConnectionFactory() Properties systemProperties = System.getProperties(); Path trustStorePath = Paths.get(systemProperties.getProperty(DOMAIN_CA_TRUSTSTORE)); + if (trustStorePath == null || !Files.exists(trustStorePath)) { + trustStorePath = Paths.get(T_CERTIFICATE); + } LOGGER.info("Trust store path: " + trustStorePath); SSLContext sslCtx = SSLContexts.custom() .loadTrustMaterial(trustStorePath.toFile(), null, new TrustSelfSignedStrategy()) diff --git a/utils/python/hdfs-file-operations.py b/utils/python/hdfs-file-operations.py new file mode 100644 index 0000000000..e69de29bb2 From b2083433a0f80b6d47e904ad9059069e847dc42e Mon Sep 17 00:00:00 2001 From: Aleksey Veresov Date: Fri, 5 Jul 2024 11:18:53 +0200 Subject: [PATCH 04/11] [FSTORE-1459] Fix test_base_client.py (#1355) * Fix test_base_client.py It was impure, changing environ variable and thereby messing other tests. This commit wraps these tests in a decorator saving and restoring os.environ. * Move changes_environ to tests/utils.py --- python/tests/client/test_base_client.py | 4 ++++ python/tests/util.py | 31 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 python/tests/util.py diff --git a/python/tests/client/test_base_client.py b/python/tests/client/test_base_client.py index 1b16ce4b37..b90b3b6f15 100644 --- a/python/tests/client/test_base_client.py +++ b/python/tests/client/test_base_client.py @@ -20,9 +20,11 @@ import requests from hsfs.client.base import Client from hsfs.client.exceptions import RestAPIError +from tests.util import changes_environ class TestBaseClient: + @changes_environ def test_valid_token_no_retires(self, mocker): # Arrange os.environ[Client.REST_ENDPOINT] = "True" @@ -48,6 +50,7 @@ def test_valid_token_no_retires(self, mocker): # Assert assert spy_retry_token_expired.call_count == 0 + @changes_environ def test_invalid_token_retires(self, mocker): # Arrange os.environ[Client.REST_ENDPOINT] = "True" @@ -77,6 +80,7 @@ def test_invalid_token_retires(self, mocker): # Assert assert spy_retry_token_expired.call_count == 10 + @changes_environ def test_invalid_token_retires_backoff_break(self, mocker): # Arrange os.environ[Client.REST_ENDPOINT] = "True" diff --git a/python/tests/util.py b/python/tests/util.py new file mode 100644 index 0000000000..4e460ba95f --- /dev/null +++ b/python/tests/util.py @@ -0,0 +1,31 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +from functools import wraps + + +def changes_environ(f): + @wraps(f) + def g(*args, **kwds): + old_environ = os.environ.copy() + try: + return f(*args, **kwds) + finally: + os.environ.clear() + os.environ.update(old_environ) + + return g From a9ba98a49add95b23b7d99718086b54098ff06b9 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey <37411285+vatj@users.noreply.github.com> Date: Fri, 5 Jul 2024 11:35:00 +0200 Subject: [PATCH 05/11] [FSTORE-1463] Refactor SQL dependencies out of util (#1362) * Refactor mysql dependencies out of util * Fix tests for mysql * Start cleaning some dependencies under type_checking --- python/hsfs/core/constants.py | 7 + .../hsfs/core/online_store_rest_client_api.py | 26 +++ python/hsfs/core/online_store_sql_engine.py | 43 +++-- python/hsfs/core/util_sql.py | 117 ++++++++++++++ python/hsfs/engine/python.py | 10 +- python/hsfs/util.py | 152 ++++-------------- python/tests/engine/test_python.py | 10 +- python/tests/test_util.py | 11 +- 8 files changed, 233 insertions(+), 143 deletions(-) create mode 100644 python/hsfs/core/util_sql.py diff --git a/python/hsfs/core/constants.py b/python/hsfs/core/constants.py index 66a997543e..c2cbd33a8b 100644 --- a/python/hsfs/core/constants.py +++ b/python/hsfs/core/constants.py @@ -13,3 +13,10 @@ "You will need to restart your kernel if applicable." ) initialise_expectation_suite_for_single_expectation_api_message = "Initialize Expectation Suite by attaching to a Feature Group to enable single expectation API" + +# Numpy +HAS_NUMPY: bool = importlib.util.find_spec("numpy") is not None + +# SQL packages +HAS_SQLALCHEMY: bool = importlib.util.find_spec("sqlalchemy") is not None +HAS_AIOMYSQL: bool = importlib.util.find_spec("aiomysql") is not None diff --git a/python/hsfs/core/online_store_rest_client_api.py b/python/hsfs/core/online_store_rest_client_api.py index 392ff87903..77baad2968 100644 --- a/python/hsfs/core/online_store_rest_client_api.py +++ b/python/hsfs/core/online_store_rest_client_api.py @@ -15,15 +15,41 @@ # import json import logging +from datetime import date, datetime from typing import Any, Dict from hsfs import util from hsfs.client import exceptions, online_store_rest_client +from hsfs.core.constants import HAS_NUMPY from requests import Response +if HAS_NUMPY: + import numpy as np + _logger = logging.getLogger(__name__) +if HAS_NUMPY: + + class NpDatetimeEncoder(json.JSONEncoder): + def default(self, obj): + dtypes = (np.datetime64, np.complexfloating) + if isinstance(obj, (datetime, date)): + return util.dateconvert_event_time_to_timestamp(obj) + elif isinstance(obj, dtypes): + return str(obj) + elif isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + if any([np.issubdtype(obj.dtype, i) for i in dtypes]): + return obj.astype(str).tolist() + return obj.tolist() + return super(NpDatetimeEncoder, self).default(obj) +else: + NpDatetimeEncoder = json.JSONEncoder + class OnlineStoreRestClientApi: SINGLE_VECTOR_ENDPOINT = "feature_store" diff --git a/python/hsfs/core/online_store_sql_engine.py 
b/python/hsfs/core/online_store_sql_engine.py index c2dd72c4a3..0f773063b6 100644 --- a/python/hsfs/core/online_store_sql_engine.py +++ b/python/hsfs/core/online_store_sql_engine.py @@ -19,15 +19,32 @@ import json import logging import re -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union -import aiomysql -import aiomysql.utils -from hsfs import feature_view, storage_connector, training_dataset, util -from hsfs.constructor.serving_prepared_statement import ServingPreparedStatement -from hsfs.core import feature_view_api, storage_connector_api, training_dataset_api -from hsfs.serving_key import ServingKey -from sqlalchemy import bindparam, exc, sql, text +from hsfs import util +from hsfs.core import ( + feature_view_api, + storage_connector_api, + training_dataset_api, +) +from hsfs.core.constants import HAS_AIOMYSQL, HAS_SQLALCHEMY + + +if TYPE_CHECKING: + from hsfs import feature_view, storage_connector, training_dataset + from hsfs.constructor.serving_prepared_statement import ServingPreparedStatement + from hsfs.serving_key import ServingKey + + +if HAS_AIOMYSQL: + import aiomysql + import aiomysql.utils + +if HAS_SQLALCHEMY: + from sqlalchemy import bindparam, exc, sql, text + +if HAS_AIOMYSQL and HAS_SQLALCHEMY: + from hsfs.core import util_sql _logger = logging.getLogger(__name__) @@ -76,7 +93,7 @@ def fetch_prepared_statements( entity: Union[feature_view.FeatureView, training_dataset.TrainingDataset], inference_helper_columns: bool, ) -> None: - if isinstance(entity, feature_view.FeatureView): + if hasattr(entity, "_feature_view_engine"): _logger.debug( f"Initialising prepared statements for feature view {entity.name} version {entity.version}." ) @@ -91,7 +108,7 @@ def fetch_prepared_statements( ) ) _logger.debug(f"{self.prepared_statements[key]}") - elif isinstance(entity, training_dataset.TrainingDataset): + elif hasattr(entity, "_training_dataset_type"): _logger.debug( f"Initialising prepared statements for training dataset {entity.name} version {entity.version}." ) @@ -457,7 +474,7 @@ def _set_mysql_connection(self, options=None): _logger.debug( f"Creating MySQL {'external' if self.external is True else ''}engine with options: {options}." ) - self._prepared_statement_engine = util.create_mysql_engine( + self._prepared_statement_engine = util_sql.create_mysql_engine( online_conn, self._external, options=options ) @@ -491,7 +508,7 @@ def _get_result_key( @staticmethod def _get_result_key_serving_key( - serving_keys: List["ServingKey"], result_dict: Dict[str, Dict[str, Any]] + serving_keys: List[ServingKey], result_dict: Dict[str, Dict[str, Any]] ) -> Tuple[str]: _logger.debug( f"Get result key serving key {serving_keys} from result dict {result_dict}" @@ -525,7 +542,7 @@ def get_prepared_statement_labels( ] async def _get_connection_pool(self, default_min_size: int) -> None: - self._connection_pool = await util.create_async_engine( + self._connection_pool = await util_sql.create_async_engine( self._online_connector, self._external, default_min_size, diff --git a/python/hsfs/core/util_sql.py b/python/hsfs/core/util_sql.py new file mode 100644 index 0000000000..93a34c535e --- /dev/null +++ b/python/hsfs/core/util_sql.py @@ -0,0 +1,117 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import asyncio +import re +from typing import Any, Dict, Optional + +from hsfs import util +from hsfs.core.constants import HAS_AIOMYSQL, HAS_SQLALCHEMY + + +if HAS_SQLALCHEMY: + from sqlalchemy import create_engine + from sqlalchemy.engine.url import make_url + +if HAS_AIOMYSQL: + from aiomysql.sa import create_engine as async_create_engine + + +def create_mysql_engine( + online_conn: Any, external: bool, options: Optional[Dict[str, Any]] = None +) -> Any: + online_options = online_conn.spark_options() + # Here we are replacing the first part of the string returned by Hopsworks, + # jdbc:mysql:// with the sqlalchemy one + username and password + # useSSL and allowPublicKeyRetrieval are not valid properties for the pymysql driver + # to use SSL we'll have to something like this: + # ssl_args = {'ssl_ca': ca_path} + # engine = create_engine("mysql+pymysql://:@/", connect_args=ssl_args) + if external: + # This only works with external clients. + # Hopsworks clients should use the storage connector + host = util.get_host_name() + online_options["url"] = re.sub( + "/[0-9.]+:", + "/{}:".format(host), + online_options["url"], + ) + + sql_alchemy_conn_str = ( + online_options["url"] + .replace( + "jdbc:mysql://", + "mysql+pymysql://" + + online_options["user"] + + ":" + + online_options["password"] + + "@", + ) + .replace("useSSL=false&", "") + .replace("?allowPublicKeyRetrieval=true", "") + ) + if options is not None and not isinstance(options, dict): + raise TypeError("`options` should be a `dict` type.") + if not options: + options = {"pool_recycle": 3600} + elif "pool_recycle" not in options: + options["pool_recycle"] = 3600 + # default connection pool size kept by engine is 5 + sql_alchemy_engine = create_engine(sql_alchemy_conn_str, **options) + return sql_alchemy_engine + + +async def create_async_engine( + online_conn: Any, + external: bool, + default_min_size: int, + options: Optional[Dict[str, Any]] = None, + hostname: Optional[str] = None, +) -> Any: + try: + loop = asyncio.get_running_loop() + except RuntimeError as er: + raise RuntimeError( + "Event loop is not running. Please invoke this co-routine from a running loop or provide an event loop." 
+ ) from er + + online_options = online_conn.spark_options() + # read the keys user, password from online_conn as use them while creating the connection pool + url = make_url(online_options["url"].replace("jdbc:", "")) + if hostname is None: + if external: + hostname = util.get_host_name() + else: + hostname = url.host + + # create a aiomysql connection pool + pool = await async_create_engine( + host=hostname, + port=3306, + user=online_options["user"], + password=online_options["password"], + db=url.database, + loop=loop, + minsize=( + options.get("minsize", default_min_size) if options else default_min_size + ), + maxsize=( + options.get("maxsize", default_min_size) if options else default_min_size + ), + pool_recycle=(options.get("pool_recycle", -1) if options else -1), + ) + return pool diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index caaecdffb1..5723a96424 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -83,7 +83,7 @@ transformation_function_engine, variable_api, ) -from hsfs.core.constants import HAS_GREAT_EXPECTATIONS +from hsfs.core.constants import HAS_AIOMYSQL, HAS_GREAT_EXPECTATIONS, HAS_SQLALCHEMY from hsfs.core.vector_db_client import VectorDbClient from hsfs.decorators import uses_great_expectations from hsfs.feature_group import ExternalFeatureGroup, FeatureGroup @@ -111,6 +111,12 @@ if HAS_GREAT_EXPECTATIONS: import great_expectations +if HAS_AIOMYSQL and HAS_SQLALCHEMY: + from hsfs.core import util_sql + +if HAS_SQLALCHEMY: + from sqlalchemy import sql + # Decimal types are currently not supported _INT_TYPES = [pa.uint8(), pa.uint16(), pa.int8(), pa.int16(), pa.int32()] _BIG_INT_TYPES = [pa.uint32(), pa.int64()] @@ -269,7 +275,7 @@ def _jdbc( self._validate_dataframe_type(dataframe_type) if self._mysql_online_fs_engine is None: - self._mysql_online_fs_engine = util.create_mysql_engine( + self._mysql_online_fs_engine = util_sql.create_mysql_engine( connector, ( isinstance(client.get_instance(), client.external.Client) diff --git a/python/hsfs/util.py b/python/hsfs/util.py index 9a694573cc..10aa3d733a 100644 --- a/python/hsfs/util.py +++ b/python/hsfs/util.py @@ -15,33 +15,39 @@ # from __future__ import annotations -import asyncio import itertools import json -import logging import re import sys import threading import time from datetime import date, datetime, timezone -from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Literal, + Optional, + Set, + Tuple, + Union, +) from urllib.parse import urljoin, urlparse -import numpy as np import pandas as pd -from aiomysql.sa import create_engine as async_create_engine -from hsfs import client, feature, feature_group, serving_key -from hsfs.client import exceptions +from hsfs import client, feature, serving_key from hsfs.client.exceptions import FeatureStoreException -from hsfs.constructor import serving_prepared_statement from hsfs.core import feature_group_api, variable_api -from sqlalchemy import create_engine -from sqlalchemy.engine.url import make_url -FEATURE_STORE_NAME_SUFFIX = "_featurestore" +if TYPE_CHECKING: + import feature_group + from hsfs.constructor import serving_prepared_statement + -_logger = logging.getLogger(__name__) +FEATURE_STORE_NAME_SUFFIX = "_featurestore" class FeatureStoreEncoder(json.JSONEncoder): @@ -128,50 +134,6 @@ def strip_feature_store_suffix(name: str) -> str: return name -def create_mysql_engine( - online_conn: Any, external: 
bool, options: Optional[Dict[str, Any]] = None -) -> Any: - online_options = online_conn.spark_options() - # Here we are replacing the first part of the string returned by Hopsworks, - # jdbc:mysql:// with the sqlalchemy one + username and password - # useSSL and allowPublicKeyRetrieval are not valid properties for the pymysql driver - # to use SSL we'll have to something like this: - # ssl_args = {'ssl_ca': ca_path} - # engine = create_engine("mysql+pymysql://:@/", connect_args=ssl_args) - if external: - # This only works with external clients. - # Hopsworks clients should use the storage connector - host = get_host_name() - online_options["url"] = re.sub( - "/[0-9.]+:", - "/{}:".format(host), - online_options["url"], - ) - - sql_alchemy_conn_str = ( - online_options["url"] - .replace( - "jdbc:mysql://", - "mysql+pymysql://" - + online_options["user"] - + ":" - + online_options["password"] - + "@", - ) - .replace("useSSL=false&", "") - .replace("?allowPublicKeyRetrieval=true", "") - ) - if options is not None and not isinstance(options, dict): - raise TypeError("`options` should be a `dict` type.") - if not options: - options = {"pool_recycle": 3600} - elif "pool_recycle" not in options: - options["pool_recycle"] = 3600 - # default connection pool size kept by engine is 5 - sql_alchemy_engine = create_engine(sql_alchemy_conn_str, **options) - return sql_alchemy_engine - - def get_host_name() -> str: host = variable_api.VariableApi().get_loadbalancer_external_domain() if host == "": @@ -188,48 +150,6 @@ def get_dataset_type(path: str) -> Literal["HIVEDB", "DATASET"]: return "DATASET" -async def create_async_engine( - online_conn: Any, - external: bool, - default_min_size: int, - options: Optional[Dict[str, Any]] = None, - hostname: Optional[str] = None, -) -> Any: - try: - loop = asyncio.get_running_loop() - except RuntimeError as er: - raise RuntimeError( - "Event loop is not running. Please invoke this co-routine from a running loop or provide an event loop." 
- ) from er - - online_options = online_conn.spark_options() - # read the keys user, password from online_conn as use them while creating the connection pool - url = make_url(online_options["url"].replace("jdbc:", "")) - if hostname is None: - if external: - hostname = get_host_name() - else: - hostname = url.host - - # create a aiomysql connection pool - pool = await async_create_engine( - host=hostname, - port=3306, - user=online_options["user"], - password=online_options["password"], - db=url.database, - loop=loop, - minsize=( - options.get("minsize", default_min_size) if options else default_min_size - ), - maxsize=( - options.get("maxsize", default_min_size) if options else default_min_size - ), - pool_recycle=(options.get("pool_recycle", -1) if options else -1), - ) - return pool - - def check_timestamp_format_from_date_string(input_date: str) -> Tuple[str, str]: date_format_patterns = { r"^([0-9]{4})([0-9]{2})([0-9]{2})$": "%Y%m%d", @@ -282,13 +202,15 @@ def get_timestamp_from_date_string(input_date: str) -> int: def get_hudi_datestr_from_timestamp(timestamp: int) -> str: - return datetime.utcfromtimestamp(timestamp / 1000).strftime("%Y%m%d%H%M%S%f")[:-3] + return datetime.fromtimestamp(timestamp / 1000, timezone.utc).strftime( + "%Y%m%d%H%M%S%f" + )[:-3] def get_delta_datestr_from_timestamp(timestamp: int) -> str: - return datetime.utcfromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S.%f")[ - :-3 - ] + return datetime.fromtimestamp(timestamp / 1000, timezone.utc).strftime( + "%Y-%m-%d %H:%M:%S.%f" + )[:-3] def convert_event_time_to_timestamp( @@ -389,13 +311,13 @@ def verify_attribute_key_names( if feature_group_obj.primary_key: diff = set(feature_group_obj.primary_key) - feature_names if diff: - raise exceptions.FeatureStoreException( + raise FeatureStoreException( f"Provided primary key(s) {','.join(diff)} doesn't exist in feature dataframe" ) if feature_group_obj.event_time: if feature_group_obj.event_time not in feature_names: - raise exceptions.FeatureStoreException( + raise FeatureStoreException( f"Provided event_time feature {feature_group_obj.event_time} doesn't exist in feature dataframe" ) @@ -403,13 +325,13 @@ def verify_attribute_key_names( if feature_group_obj.partition_key: diff = set(feature_group_obj.partition_key) - feature_names if diff: - raise exceptions.FeatureStoreException( + raise FeatureStoreException( f"Provided partition key(s) {','.join(diff)} doesn't exist in feature dataframe" ) if feature_group_obj.hudi_precombine_key: if feature_group_obj.hudi_precombine_key not in feature_names: - raise exceptions.FeatureStoreException( + raise FeatureStoreException( f"Provided hudi precombine key {feature_group_obj.hudi_precombine_key} " f"doesn't exist in feature dataframe" ) @@ -550,24 +472,6 @@ def is_runtime_notebook(): return False -class NpDatetimeEncoder(json.JSONEncoder): - def default(self, obj): - dtypes = (np.datetime64, np.complexfloating) - if isinstance(obj, (datetime, date)): - return convert_event_time_to_timestamp(obj) - elif isinstance(obj, dtypes): - return str(obj) - elif isinstance(obj, np.integer): - return int(obj) - elif isinstance(obj, np.floating): - return float(obj) - elif isinstance(obj, np.ndarray): - if any([np.issubdtype(obj.dtype, i) for i in dtypes]): - return obj.astype(str).tolist() - return obj.tolist() - return super(NpDatetimeEncoder, self).default(obj) - - class VersionWarning(Warning): pass diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index de070ea047..e6a0b1213b 100644 --- 
a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -125,7 +125,9 @@ def test_sql_offline_dataframe_type_none(self, mocker): def test_jdbc(self, mocker): # Arrange - mock_util_create_mysql_engine = mocker.patch("hsfs.util.create_mysql_engine") + mock_util_create_mysql_engine = mocker.patch( + "hsfs.core.util_sql.create_mysql_engine" + ) mocker.patch("hsfs.client.get_instance") mock_python_engine_return_dataframe_type = mocker.patch( "hsfs.engine.python.Engine._return_dataframe_type" @@ -145,7 +147,7 @@ def test_jdbc(self, mocker): def test_jdbc_dataframe_type_none(self, mocker): # Arrange - mocker.patch("hsfs.util.create_mysql_engine") + mocker.patch("hsfs.core.util_sql.create_mysql_engine") mocker.patch("hsfs.client.get_instance") query = "SELECT * FROM TABLE" @@ -165,7 +167,9 @@ def test_jdbc_dataframe_type_none(self, mocker): def test_jdbc_read_options(self, mocker): # Arrange - mock_util_create_mysql_engine = mocker.patch("hsfs.util.create_mysql_engine") + mock_util_create_mysql_engine = mocker.patch( + "hsfs.core.util_sql.create_mysql_engine" + ) mocker.patch("hsfs.client.get_instance") mock_python_engine_return_dataframe_type = mocker.patch( "hsfs.engine.python.Engine._return_dataframe_type" diff --git a/python/tests/test_util.py b/python/tests/test_util.py index 8ad7be1da6..217611bd1f 100644 --- a/python/tests/test_util.py +++ b/python/tests/test_util.py @@ -21,11 +21,16 @@ import pytz from hsfs import util from hsfs.client.exceptions import FeatureStoreException +from hsfs.core.constants import HAS_AIOMYSQL, HAS_SQLALCHEMY from hsfs.embedding import EmbeddingFeature, EmbeddingIndex from hsfs.feature import Feature from mock import patch +if HAS_SQLALCHEMY and HAS_AIOMYSQL: + from hsfs.core import util_sql + + class TestUtil: def test_get_hudi_datestr_from_timestamp(self): dt = util.get_hudi_datestr_from_timestamp(1640995200000) @@ -209,6 +214,10 @@ def test_empty_schema(self): util.validate_embedding_feature_type(embedding_index, schema) # No exception should be raised + @pytest.mark.skipif( + not HAS_SQLALCHEMY or not HAS_AIOMYSQL, + reason="SQLAlchemy or aiomysql is not installed", + ) def test_create_async_engine(self, mocker): # Test when get_running_loop() raises a RuntimeError with patch("asyncio.get_running_loop", side_effect=RuntimeError): @@ -218,4 +227,4 @@ def test_create_async_engine(self, mocker): RuntimeError, match="Event loop is not running. 
Please invoke this co-routine from a running loop or provide an event loop.", ): - asyncio.run(util.create_async_engine(online_connector, True, 1)) + asyncio.run(util_sql.create_async_engine(online_connector, True, 1)) From 7c5bbb24104e521a70a91f2d61d89e13312de608 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey <37411285+vatj@users.noreply.github.com> Date: Fri, 5 Jul 2024 11:45:58 +0200 Subject: [PATCH 06/11] [FSTORE-1462] Remove hive connection from python client (#1361) * Start removing hive related dependencies * More hive cleanups * Update ruff pre-commit hook * Start fixing pytest * REmove hive related tests * Review --- python/.pre-commit-config.yaml | 2 +- python/hsfs/connection.py | 6 +- python/hsfs/constructor/query.py | 4 - python/hsfs/core/arrow_flight_client.py | 16 ++-- python/hsfs/engine/__init__.py | 6 +- python/hsfs/engine/python.py | 91 +++---------------- python/hsfs/engine/spark_no_metastore.py | 2 +- python/hsfs/feature_group.py | 7 -- python/hsfs/feature_store.py | 2 - python/hsfs/feature_view.py | 38 +------- python/pyproject.toml | 7 -- python/tests/core/test_arrow_flight_client.py | 78 ---------------- python/tests/engine/test_python.py | 57 +----------- 13 files changed, 36 insertions(+), 280 deletions(-) diff --git a/python/.pre-commit-config.yaml b/python/.pre-commit-config.yaml index 472789f87f..98e886d9dc 100644 --- a/python/.pre-commit-config.yaml +++ b/python/.pre-commit-config.yaml @@ -1,7 +1,7 @@ exclude: setup.py repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.2 + rev: v0.5.0 hooks: - id: ruff args: [--fix] diff --git a/python/hsfs/connection.py b/python/hsfs/connection.py index 2cf8cb4a63..ce10d515f2 100644 --- a/python/hsfs/connection.py +++ b/python/hsfs/connection.py @@ -215,9 +215,9 @@ def connect(self) -> None: self._engine is None and importlib.util.find_spec("pyspark") ): self._engine = "spark" - elif ( - self._engine is not None and self._engine.lower() in ["hive", "python"] - ) or (self._engine is None and not importlib.util.find_spec("pyspark")): + elif (self._engine is not None and self._engine.lower() == "python") or ( + self._engine is None and not importlib.util.find_spec("pyspark") + ): self._engine = "python" elif self._engine is not None and self._engine.lower() == "training": self._engine = "training" diff --git a/python/hsfs/constructor/query.py b/python/hsfs/constructor/query.py index e305e8ca5a..d14a92ac05 100644 --- a/python/hsfs/constructor/query.py +++ b/python/hsfs/constructor/query.py @@ -162,12 +162,8 @@ def read( dataframe_type: DataFrame type to return. Defaults to `"default"`. read_options: Dictionary of read options for Spark in spark engine. Only for python engine: - * key `"use_hive"` and value `True` to read query with Hive instead of - [Hopsworks Feature Query Service](https://docs.hopsworks.ai/latest/setup_installation/common/arrow_flight_duckdb/). * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` - * key "hive_config" to pass a dictionary of hive or tez configurations. - For example: `{"hive_config": {"hive.tez.cpu.vcores": 2, "tez.grouping.split-count": "3"}}` Defaults to `{}`. 
# Returns diff --git a/python/hsfs/core/arrow_flight_client.py b/python/hsfs/core/arrow_flight_client.py index f8dbb7d992..3cda7533b1 100644 --- a/python/hsfs/core/arrow_flight_client.py +++ b/python/hsfs/core/arrow_flight_client.py @@ -92,9 +92,7 @@ def _should_retry_healthcheck_or_certificate_registration(exception): def is_data_format_supported(data_format: str, read_options: Optional[Dict[str, Any]]): if data_format not in ArrowFlightClient.SUPPORTED_FORMATS: return False - elif read_options and ( - read_options.get("use_hive", False) or read_options.get("use_spark", False) - ): + elif read_options and read_options.get("use_spark", False): return False else: return get_instance()._should_be_used() @@ -122,9 +120,7 @@ def _is_query_supported_rec(query: query.Query): def is_query_supported(query: query.Query, read_options: Optional[Dict[str, Any]]): - if read_options and ( - read_options.get("use_hive", False) or read_options.get("use_spark", False) - ): + if read_options and read_options.get("use_spark", False): return False elif not _is_query_supported_rec(query): return False @@ -138,12 +134,12 @@ class ArrowFlightClient: StorageConnector.SNOWFLAKE, StorageConnector.BIGQUERY, ] - READ_ERROR = 'Could not read data using Hopsworks Feature Query Service. If the issue persists, use read_options={"use_hive": True} instead.' + READ_ERROR = "Could not read data using Hopsworks Feature Query Service." WRITE_ERROR = 'Could not write data using Hopsworks Feature Query Service. If the issue persists, use write_options={"use_spark": True} instead.' DEFAULTING_TO_DIFFERENT_SERVICE_WARNING = ( - "Defaulting to Hive/Spark execution for this call." + "Defaulting to Spark execution for this call." ) - CLIENT_WILL_STAY_ACTIVE_WARNING = 'The client will remain active for future calls. If the issue persists, use read_options={"use_hive": True} or write_options={"use_spark": True}.' + CLIENT_WILL_STAY_ACTIVE_WARNING = 'The client will remain active for future calls. If the issue persists write_options={"use_spark": True}.' DEFAULT_TIMEOUT_SECONDS = 900 DEFAULT_HEALTHCHECK_TIMEOUT_SECONDS = 5 DEFAULT_GRPC_MIN_RECONNECT_BACKOFF_MS = 2000 @@ -192,7 +188,7 @@ def __init__(self, disabled_for_session: bool = False): self._health_check() self._register_certificates() except Exception as e: - _logger.debug("Failed to connect to Hopsworks Feature Query Service") + _logger.debug("Failed to connect to Hopsworks Feature Query Service.") _logger.exception(e) warnings.warn( f"Failed to connect to Hopsworks Feature Query Service, got {str(e)}." diff --git a/python/hsfs/engine/__init__.py b/python/hsfs/engine/__init__.py index 544b2c62fa..85b636bd0e 100644 --- a/python/hsfs/engine/__init__.py +++ b/python/hsfs/engine/__init__.py @@ -33,10 +33,14 @@ def init(engine_type: str) -> None: if engine_type == "spark": _engine_type = "spark" _engine = spark.Engine() + elif engine_type == "hive": + raise ValueError( + "Hive engine is not supported in hopsworks client version >= 4.0." 
+ ) elif engine_type == "spark-no-metastore": _engine_type = "spark-no-metastore" _engine = spark_no_metastore.Engine() - elif engine_type in ["hive", "python", "training"]: + elif engine_type in ["python", "training"]: try: from hsfs.engine import python except ImportError as err: diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 5723a96424..dd966b366e 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -18,7 +18,6 @@ import ast import decimal import json -import logging import math import numbers import os @@ -81,7 +80,6 @@ training_dataset_api, training_dataset_job_conf, transformation_function_engine, - variable_api, ) from hsfs.core.constants import HAS_AIOMYSQL, HAS_GREAT_EXPECTATIONS, HAS_SQLALCHEMY from hsfs.core.vector_db_client import VectorDbClient @@ -89,16 +87,10 @@ from hsfs.feature_group import ExternalFeatureGroup, FeatureGroup from hsfs.training_dataset import TrainingDataset from hsfs.training_dataset_split import TrainingDatasetSplit -from pyhive import hive -from pyhive.exc import OperationalError from sqlalchemy import sql -from thrift.transport.TTransport import TTransportException from tqdm.auto import tqdm -# Disable pyhive INFO logging -logging.getLogger("pyhive").setLevel(logging.WARNING) - HAS_FAST = False try: from fastavro import schemaless_writer @@ -183,7 +175,7 @@ def sql( self, sql_query: str, feature_store: feature_store.FeatureStore, - online_conn: Optional["sc.OnlineStorageConnector"], + online_conn: Optional["sc.JdbcConnector"], dataframe_type: str, read_options: Optional[Dict[str, Any]], schema: Optional[List["feature.Feature"]] = None, @@ -191,10 +183,8 @@ def sql( if not online_conn: return self._sql_offline( sql_query, - feature_store, dataframe_type, schema, - hive_config=read_options.get("hive_config") if read_options else None, arrow_flight_config=read_options.get("arrow_flight_config", {}) if read_options else {}, @@ -224,10 +214,8 @@ def _validate_dataframe_type(self, dataframe_type: str): def _sql_offline( self, sql_query: str, - feature_store: feature_store.FeatureStore, dataframe_type: str, schema: Optional[List["feature.Feature"]] = None, - hive_config: Optional[Dict[str, Any]] = None, arrow_flight_config: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, pl.DataFrame]: self._validate_dataframe_type(dataframe_type) @@ -240,26 +228,9 @@ def _sql_offline( dataframe_type, ) else: - with self._create_hive_connection( - feature_store, hive_config=hive_config - ) as hive_conn: - # Suppress SQLAlchemy pandas warning - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - if dataframe_type.lower() == "polars": - result_df = util.run_with_loading_animation( - "Reading data from Hopsworks, using Hive", - pl.read_database, - sql_query, - hive_conn, - ) - else: - result_df = util.run_with_loading_animation( - "Reading data from Hopsworks, using Hive", - pd.read_sql, - sql_query, - hive_conn, - ) + raise ValueError( + "Reading data with Hive is not supported when using hopsworks client version >= 4.0" + ) if schema: result_df = Engine.cast_columns(result_df, schema) return self._return_dataframe_type(result_df, dataframe_type) @@ -267,13 +238,12 @@ def _sql_offline( def _jdbc( self, sql_query: str, - connector: "sc.OnlineStorageConnector", + connector: sc.JdbcConnector, dataframe_type: str, read_options: Optional[Dict[str, Any]], - schema: Optional[List["feature.Feature"]] = None, + schema: Optional[List[feature.Feature]] = None, ) -> Union[pd.DataFrame, 
pl.DataFrame]: self._validate_dataframe_type(dataframe_type) - if self._mysql_online_fs_engine is None: self._mysql_online_fs_engine = util_sql.create_mysql_engine( connector, @@ -296,11 +266,11 @@ def _jdbc( def read( self, - storage_connector: "sc.StorageConnector", + storage_connector: sc.StorageConnector, data_format: str, read_options: Optional[Dict[str, Any]], location: Optional[str], - dataframe_type: str, + dataframe_type: Literal["polars", "pandas", "default"], ) -> Union[pd.DataFrame, pl.DataFrame]: if not data_format: raise FeatureStoreException("data_format is not specified") @@ -351,7 +321,9 @@ def _read_pandas(self, data_format: str, obj: Any) -> pd.DataFrame: ) ) - def _read_polars(self, data_format: str, obj: Any) -> pl.DataFrame: + def _read_polars( + self, data_format: Literal["csv", "tsv", "parquet"], obj: Any + ) -> pl.DataFrame: if data_format.lower() == "csv": return pl.read_csv(obj) elif data_format.lower() == "tsv": @@ -526,7 +498,7 @@ def show( sql_query: str, feature_store: feature_store.FeatureStore, n: int, - online_conn: "sc.OnlineStorageConnector", + online_conn: "sc.JdbcConnector", read_options: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, pl.DataFrame]: return self.sql( @@ -568,8 +540,8 @@ def register_hudi_temporary_table( or hudi_fg_alias.left_feature_group_start_timestamp is not None ): raise FeatureStoreException( - "Hive engine on Python environments does not support incremental queries. " - + "Read feature group without timestamp to retrieve latest snapshot or switch to " + "Incremental queries are not supported in the python client." + + " Read feature group without timestamp to retrieve latest snapshot or switch to " + "environment with Spark Engine." ) @@ -1140,41 +1112,6 @@ def write_training_dataset( ) return td_job - def _create_hive_connection( - self, - feature_store: feature_store.FeatureStore, - hive_config: Optional[Dict[str, Any]] = None, - ) -> hive.Connection: - host = variable_api.VariableApi().get_loadbalancer_external_domain() - if host == "": - # If the load balancer is not configured, then fall back to use - # the hive server on the head node - host = client.get_instance().host - - try: - return hive.Connection( - host=host, - port=9085, - # database needs to be set every time, 'default' doesn't work in pyhive - database=feature_store, - configuration=hive_config, - auth="CERTIFICATES", - truststore=client.get_instance()._get_jks_trust_store_path(), - keystore=client.get_instance()._get_jks_key_store_path(), - keystore_password=client.get_instance()._cert_key, - ) - except (TTransportException, AttributeError) as err: - raise ValueError( - f"Cannot connect to hive server. Please check the host name '{client.get_instance()._host}' " - "is correct and make sure port '9085' is open on host server." - ) from err - except OperationalError as err: - if err.args[0].status.statusCode == 3: - raise RuntimeError( - f"Cannot access feature store '{feature_store}'. Please check if your project has the access right." - f" It is possible to request access from data owners of '{feature_store}'." 
- ) from err - def _return_dataframe_type( self, dataframe: Union[pd.DataFrame, pl.DataFrame], dataframe_type: str ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[List[Any]]]: diff --git a/python/hsfs/engine/spark_no_metastore.py b/python/hsfs/engine/spark_no_metastore.py index bea80ddca2..89505b7977 100644 --- a/python/hsfs/engine/spark_no_metastore.py +++ b/python/hsfs/engine/spark_no_metastore.py @@ -32,6 +32,6 @@ def __init__(self) -> None: super().__init__() - def _sql_offline(self, sql_query, feature_store): + def _sql_offline(self, sql_query): # Spark no metastore does not require the return self._spark_session.sql(sql_query) diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index f490963542..b257b8edd4 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -2241,12 +2241,8 @@ def read( read_options: Additional options as key/value pairs to pass to the execution engine. For spark engine: Dictionary of read options for Spark. For python engine: - * key `"use_hive"` and value `True` to read feature group - with Hive instead of [Hopsworks Feature Query Service](https://docs.hopsworks.ai/latest/setup_installation/common/arrow_flight_duckdb/). * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` - * key `"hive_config"` to pass a dictionary of hive or tez configurations. - For example: `{"hive_config": {"hive.tez.cpu.vcores": 2, "tez.grouping.split-count": "3"}}` * key `"pandas_types"` and value `True` to retrieve columns as [Pandas nullable types](https://pandas.pydata.org/docs/user_guide/integer_na.html) rather than numpy/object(string) types (experimental). @@ -2331,9 +2327,6 @@ def read_changes( `%Y-%m-%d %H:%M:%S`, or `%Y-%m-%d %H:%M:%S.%f`. read_options: Additional options as key/value pairs to pass to the execution engine. For spark engine: Dictionary of read options for Spark. - For python engine: - * key `"hive_config"` to pass a dictionary of hive or tez configurations. - For example: `{"hive_config": {"hive.tez.cpu.vcores": 2, "tez.grouping.split-count": "3"}}` Defaults to `{}`. # Returns diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index ef2d63571b..5a14a7c514 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -451,8 +451,6 @@ def sql( read_options: Additional options as key/value pairs to pass to the execution engine. For spark engine: Dictionary of read options for Spark. For python engine: - * key `"hive_config"` to pass a dictionary of hive or tez configurations. - For example: `{"hive_config": {"hive.tez.cpu.vcores": 2, "tez.grouping.split-count": "3"}}` If running queries on the online feature store, users can provide an entry `{'external': True}`, this instructs the library to use the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) to establish the connection to the online feature store. If not set, or set to False, the online feature store storage connector is used which relies on diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 1afc9569bc..e177513610 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -955,11 +955,7 @@ def get_batch_data( end_time: End event time for the batch query, exclusive. Optional. Strings should be formatted in one of the following formats `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d %H:%M:%S`, or `%Y-%m-%d %H:%M:%S.%f`. 
Int, i.e Unix Epoch should be in seconds. - read_options: User provided read options. - Dictionary of read options for python engine: - * key `"use_hive"` and value `True` to read batch data with Hive instead of - [Hopsworks Feature Query Service](https://docs.hopsworks.ai/latest/setup_installation/common/arrow_flight_duckdb/). - Defaults to `{}`. + read_options: User provided read options for python engine, defaults to `{}`: * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` spine: Spine dataframe with primary key, event time and @@ -2137,13 +2133,8 @@ def training_data( For spark engine: Dictionary of read options for Spark. When using the `python` engine, read_options can contain the following entries: - * key `"use_hive"` and value `True` to create in-memory training dataset - with Hive instead of - [Hopsworks Feature Query Service](https://docs.hopsworks.ai/latest/setup_installation/common/arrow_flight_duckdb/). * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. - For example: `{"arrow_flight_config": {"timeout": 900}}` - * key `"hive_config"` to pass a dictionary of hive or tez configurations. - For example: `{"hive_config": {"hive.tez.cpu.vcores": 2, "tez.grouping.split-count": "3"}}` + For example: `{"arrow_flight_config": {"timeout": 900}}`. * key `spark` and value an object of type [hsfs.core.job_configuration.JobConfiguration](../job_configuration) to configure the Hopsworks Job used to compute the training dataset. @@ -2304,13 +2295,8 @@ def train_test_split( For spark engine: Dictionary of read options for Spark. When using the `python` engine, read_options can contain the following entries: - * key `"use_hive"` and value `True` to create in-memory training dataset - with Hive instead of - [Hopsworks Feature Query Service](https://docs.hopsworks.ai/latest/setup_installation/common/arrow_flight_duckdb/). * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` - * key `"hive_config"` to pass a dictionary of hive or tez configurations. - For example: `{"hive_config": {"hive.tez.cpu.vcores": 2, "tez.grouping.split-count": "3"}}` * key `spark` and value an object of type [hsfs.core.job_configuration.JobConfiguration](../job_configuration) to configure the Hopsworks Job used to compute the training dataset. @@ -2511,13 +2497,8 @@ def train_validation_test_split( For spark engine: Dictionary of read options for Spark. When using the `python` engine, read_options can contain the following entries: - * key `"use_hive"` and value `True` to create in-memory training dataset - with Hive instead of - [Hopsworks Feature Query Service](https://docs.hopsworks.ai/latest/setup_installation/common/arrow_flight_duckdb/). * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` - * key `"hive_config"` to pass a dictionary of hive or tez configurations. - For example: `{"hive_config": {"hive.tez.cpu.vcores": 2, "tez.grouping.split-count": "3"}}` * key `spark` and value an object of type [hsfs.core.job_configuration.JobConfiguration](../job_configuration) to configure the Hopsworks Job used to compute the training dataset. @@ -2658,13 +2639,8 @@ def get_training_data( read_options: Additional options as key/value pairs to pass to the execution engine. For spark engine: Dictionary of read options for Spark. 
For python engine: - * key `"use_hive"` and value `True` to read training dataset - with the Hopsworks API instead of - [Hopsworks Feature Query Service](https://docs.hopsworks.ai/latest/setup_installation/common/arrow_flight_duckdb/). * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` - * key `"hive_config"` to pass a dictionary of hive or tez configurations. - For example: `{"hive_config": {"hive.tez.cpu.vcores": 2, "tez.grouping.split-count": "3"}}` Defaults to `{}`. primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key features. @@ -2729,13 +2705,8 @@ def get_train_test_split( read_options: Additional options as key/value pairs to pass to the execution engine. For spark engine: Dictionary of read options for Spark. For python engine: - * key `"use_hive"` and value `True` to read training dataset - with the Hopsworks API instead of - [Hopsworks Feature Query Service](https://docs.hopsworks.ai/latest/setup_installation/common/arrow_flight_duckdb/). * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` - * key `"hive_config"` to pass a dictionary of hive or tez configurations. - For example: `{"hive_config": {"hive.tez.cpu.vcores": 2, "tez.grouping.split-count": "3"}}` Defaults to `{}`. primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key features. @@ -2804,13 +2775,8 @@ def get_train_validation_test_split( read_options: Additional options as key/value pairs to pass to the execution engine. For spark engine: Dictionary of read options for Spark. For python engine: - * key `"use_hive"` and value `True` to read training dataset - with the Hopsworks API instead of - [Hopsworks Feature Query Service](https://docs.hopsworks.ai/latest/setup_installation/common/arrow_flight_duckdb/). * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` - * key `"hive_config"` to pass a dictionary of hive or tez configurations. - For example: `{"hive_config": {"hive.tez.cpu.vcores": 2, "tez.grouping.split-count": "3"}}` Defaults to `{}`. primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key features. 
diff --git a/python/pyproject.toml b/python/pyproject.toml index 499d2097ea..abb6527c22 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -50,14 +50,7 @@ dependencies = [ ] [project.optional-dependencies] -hive = [ - "pyhopshive[thrift]", - "pyarrow>=10.0", - "confluent-kafka<=2.3.0", - "fastavro>=1.4.11,<=1.8.4", -] python = [ - "pyhopshive[thrift]", "pyarrow>=10.0", "confluent-kafka<=2.3.0", "fastavro>=1.4.11,<=1.8.4", diff --git a/python/tests/core/test_arrow_flight_client.py b/python/tests/core/test_arrow_flight_client.py index 0b647aedf1..37f4523ba2 100644 --- a/python/tests/core/test_arrow_flight_client.py +++ b/python/tests/core/test_arrow_flight_client.py @@ -125,20 +125,6 @@ def test_read_feature_group(self, mocker, backend_fixtures): # Assert assert mock_read_query.call_count == 1 - def test_read_feature_group_spark(self, mocker, backend_fixtures): - # Arrange - self._arrange_engine_mocks(mocker, backend_fixtures) - fg = self._arrange_featuregroup_mocks(backend_fixtures) - mock_creat_hive_connection = mocker.patch( - "hsfs.engine.python.Engine._create_hive_connection" - ) - - # Act - fg.read(read_options={"use_hive": True}) - - # Assert - assert mock_creat_hive_connection.call_count == 1 - def test_read_query(self, mocker, backend_fixtures): # Arrange self._arrange_engine_mocks(mocker, backend_fixtures) @@ -154,21 +140,6 @@ def test_read_query(self, mocker, backend_fixtures): # Assert assert mock_read_query.call_count == 1 - def test_read_query_spark(self, mocker, backend_fixtures): - # Arrange - self._arrange_engine_mocks(mocker, backend_fixtures) - fg = self._arrange_featuregroup_mocks(backend_fixtures) - mock_creat_hive_connection = mocker.patch( - "hsfs.engine.python.Engine._create_hive_connection" - ) - query = fg.select_all() - - # Act - query.read(read_options={"use_hive": True}) - - # Assert - assert mock_creat_hive_connection.call_count == 1 - def test_training_data_featureview(self, mocker, backend_fixtures): # Arrange self._arrange_engine_mocks(mocker, backend_fixtures) @@ -183,20 +154,6 @@ def test_training_data_featureview(self, mocker, backend_fixtures): # Assert assert mock_read_query.call_count == 1 - def test_training_data_featureview_spark(self, mocker, backend_fixtures): - # Arrange - self._arrange_engine_mocks(mocker, backend_fixtures) - fv = self._arrange_featureview_mocks(mocker, backend_fixtures) - mock_creat_hive_connection = mocker.patch( - "hsfs.engine.python.Engine._create_hive_connection" - ) - - # Act - fv.training_data(read_options={"use_hive": True}) - - # Assert - assert mock_creat_hive_connection.call_count == 1 - def test_batch_data_featureview(self, mocker, backend_fixtures): # Arrange self._arrange_engine_mocks(mocker, backend_fixtures) @@ -211,20 +168,6 @@ def test_batch_data_featureview(self, mocker, backend_fixtures): # Assert assert mock_read_query.call_count == 1 - def test_batch_data_featureview_spark(self, mocker, backend_fixtures): - # Arrange - self._arrange_engine_mocks(mocker, backend_fixtures) - fv = self._arrange_featureview_mocks(mocker, backend_fixtures) - mock_creat_hive_connection = mocker.patch( - "hsfs.engine.python.Engine._create_hive_connection" - ) - - # Act - fv.get_batch_data(read_options={"use_hive": True}) - - # Assert - assert mock_creat_hive_connection.call_count == 1 - def test_get_training_data_featureview(self, mocker, backend_fixtures): # Arrange self._arrange_engine_mocks(mocker, backend_fixtures) @@ -241,27 +184,6 @@ def test_get_training_data_featureview(self, mocker, backend_fixtures): # 
Assert assert mock_read_path.call_count == 1 - def test_get_training_data_featureview_spark(self, mocker, backend_fixtures): - # Arrange - self._arrange_engine_mocks(mocker, backend_fixtures) - fv = self._arrange_featureview_mocks(mocker, backend_fixtures) - self._arrange_dataset_reads(mocker, backend_fixtures, "parquet") - stream = mocker.MagicMock() - stream.content = b"" - mock_read_file = mocker.patch( - "hsfs.core.dataset_api.DatasetApi.read_content", return_value=stream - ) - mock_read_pandas = mocker.patch( - "hsfs.engine.python.Engine._read_pandas", return_value=pd.DataFrame() - ) - - # Act - fv.get_training_data(1, read_options={"use_hive": True}) - - # Assert - assert mock_read_file.call_count == 1 - assert mock_read_pandas.call_count == 1 - def test_construct_query_object(self, mocker, backend_fixtures): # Arrange self._arrange_engine_mocks(mocker, backend_fixtures) diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index e6a0b1213b..a6740b71e6 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -87,42 +87,6 @@ def test_sql_online_conn(self, mocker): assert mock_python_engine_sql_offline.call_count == 0 assert mock_python_engine_jdbc.call_count == 1 - def test_sql_offline(self, mocker): - # Arrange - mock_python_engine_create_hive_connection = mocker.patch( - "hsfs.engine.python.Engine._create_hive_connection" - ) - mock_python_engine_return_dataframe_type = mocker.patch( - "hsfs.engine.python.Engine._return_dataframe_type" - ) - - python_engine = python.Engine() - - # Act - python_engine._sql_offline( - sql_query="", feature_store=None, dataframe_type="default" - ) - - # Assert - assert mock_python_engine_create_hive_connection.call_count == 1 - assert mock_python_engine_return_dataframe_type.call_count == 1 - - def test_sql_offline_dataframe_type_none(self, mocker): - # Arrange - mocker.patch("hsfs.engine.python.Engine._create_hive_connection") - - python_engine = python.Engine() - - with pytest.raises(exceptions.FeatureStoreException) as fstore_except: - # Act - python_engine._sql_offline( - sql_query="", feature_store=None, dataframe_type=None - ) - assert ( - str(fstore_except.value) - == 'dataframe_type : None not supported. Possible values are "default", "pandas", "polars", "numpy" or "python"' - ) - def test_jdbc(self, mocker): # Arrange mock_util_create_mysql_engine = mocker.patch( @@ -899,8 +863,8 @@ def test_register_hudi_temporary_table_time_travel(self): # Assert assert str(e_info.value) == ( - "Hive engine on Python environments does not support incremental queries. " - + "Read feature group without timestamp to retrieve latest snapshot or switch to " + "Incremental queries are not supported in the python client." + + " Read feature group without timestamp to retrieve latest snapshot or switch to " + "environment with Spark Engine." ) @@ -933,8 +897,8 @@ def test_register_hudi_temporary_table_time_travel_sub_query(self): # Assert assert str(e_info.value) == ( - "Hive engine on Python environments does not support incremental queries. " - + "Read feature group without timestamp to retrieve latest snapshot or switch to " + "Incremental queries are not supported in the python client." + + " Read feature group without timestamp to retrieve latest snapshot or switch to " + "environment with Spark Engine." 
) @@ -3075,19 +3039,6 @@ def test_write_training_dataset_query_fv(self, mocker, backend_fixtures): assert mock_td_api.return_value.compute.call_count == 0 assert mock_job._wait_for_job.call_count == 1 - def test_create_hive_connection(self, mocker): - # Arrange - mocker.patch("hsfs.client.get_instance") - mock_pyhive_conn = mocker.patch("pyhive.hive.Connection") - - python_engine = python.Engine() - - # Act - python_engine._create_hive_connection(feature_store=None) - - # Assert - assert mock_pyhive_conn.call_count == 1 - def test_return_dataframe_type_default(self): # Arrange python_engine = python.Engine() From 4b79d3439382945d49dc92aa2fa3d9c450e9d106 Mon Sep 17 00:00:00 2001 From: davitbzh <44586065+davitbzh@users.noreply.github.com> Date: Fri, 5 Jul 2024 11:59:23 +0200 Subject: [PATCH 07/11] [FSTORE-1435] In helper columns change argument "primary_keys" to "primary_key" (#1357) * primary_keys -> primary_key * backwards compatibility * backwards compatibility * Review and fix missed foc change --------- Co-authored-by: davitbzh Co-authored-by: Victor Jouffrey Co-authored-by: Victor Jouffrey <37411285+vatj@users.noreply.github.com> --- python/hsfs/feature_view.py | 78 +++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 34 deletions(-) diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index e177513610..f78e9a7972 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -917,10 +917,11 @@ def get_batch_data( end_time: Optional[Union[str, int, datetime, date]] = None, read_options: Optional[Dict[str, Any]] = None, spine: Optional[SplineDataFrameTypes] = None, - primary_keys: bool = False, + primary_key: bool = False, event_time: bool = False, inference_helper_columns: bool = False, dataframe_type: Optional[str] = "default", + **kwargs, ) -> TrainingDatasetDataFrameTypes: """Get a batch of data from an event time interval from the offline feature store. @@ -964,7 +965,7 @@ def get_batch_data( It is possible to directly pass a spine group instead of a dataframe to overwrite the left side of the feature join, however, the same features as in the original feature group that is being replaced need to be available in the spine group. - primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key + primary_key: whether to include primary key features or not. Defaults to `False`, no primary key features. event_time: whether to include event time feature or not. Defaults to `False`, no event time feature. inference_helper_columns: whether to include inference helper columns or not. @@ -994,7 +995,7 @@ def get_batch_data( self._batch_scoring_server._transformation_functions, read_options, spine, - primary_keys, + kwargs.get("primary_keys") or primary_key, event_time, inference_helper_columns, dataframe_type, @@ -1187,10 +1188,11 @@ def create_training_data( statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, write_options: Optional[Dict[Any, Any]] = None, spine: Optional[SplineDataFrameTypes] = None, - primary_keys: bool = False, + primary_key: bool = False, event_time: bool = False, training_helper_columns: bool = False, - ) -> Tuple[int, "job.Job"]: + **kwargs, + ) -> Tuple[int, job.Job]: """Create the metadata for a training dataset and save the corresponding training data into `location`. The training data can be retrieved by calling `feature_view.get_training_data`. 
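The hunks above and below follow one backward-compatibility pattern: the public keyword is renamed from `primary_keys` to `primary_key`, and the old spelling is still honoured through a `**kwargs` fallback. A minimal, hypothetical sketch of that pattern (standalone function, not part of the diff):

```python
def get_batch_data(primary_key: bool = False, **kwargs) -> bool:
    # Prefer the new flag, but fall back to the deprecated `primary_keys=`
    # keyword if an older caller still passes it.
    primary_key = kwargs.get("primary_keys") or primary_key
    return primary_key


assert get_batch_data(primary_key=True) is True   # new spelling
assert get_batch_data(primary_keys=True) is True  # legacy spelling still accepted
assert get_batch_data() is False                  # default unchanged
```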
@@ -1353,7 +1355,7 @@ def create_training_data( It is possible to directly pass a spine group instead of a dataframe to overwrite the left side of the feature join, however, the same features as in the original feature group that is being replaced need to be available in the spine group. - primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key + primary_key: whether to include primary key features or not. Defaults to `False`, no primary key features. event_time: whether to include event time feature or not. Defaults to `False`, no event time feature. training_helper_columns: whether to include training helper columns or not. Training helper columns are a @@ -1388,7 +1390,7 @@ def create_training_data( td, write_options or {}, spine=spine, - primary_keys=primary_keys, + primary_keys=kwargs.get("primary_keys") or primary_key, event_time=event_time, training_helper_columns=training_helper_columns, ) @@ -1419,10 +1421,11 @@ def create_train_test_split( statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, write_options: Optional[Dict[Any, Any]] = None, spine: Optional[SplineDataFrameTypes] = None, - primary_keys: bool = False, + primary_key: bool = False, event_time: bool = False, training_helper_columns: bool = False, - ) -> Tuple[int, "job.Job"]: + **kwargs, + ) -> Tuple[int, job.Job]: """Create the metadata for a training dataset and save the corresponding training data into `location`. The training data is split into train and test set at random or according to time ranges. The training data can be retrieved by calling `feature_view.get_train_test_split`. @@ -1631,7 +1634,7 @@ def create_train_test_split( It is possible to directly pass a spine group instead of a dataframe to overwrite the left side of the feature join, however, the same features as in the original feature group that is being replaced need to be available in the spine group. - primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key + primary_key: whether to include primary key features or not. Defaults to `False`, no primary key features. event_time: whether to include event time feature or not. Defaults to `False`, no event time feature. training_helper_columns: whether to include training helper columns or not. @@ -1675,7 +1678,7 @@ def create_train_test_split( td, write_options or {}, spine=spine, - primary_keys=primary_keys, + primary_keys=kwargs.get("primary_keys") or primary_key, event_time=event_time, training_helper_columns=training_helper_columns, ) @@ -1708,10 +1711,11 @@ def create_train_validation_test_split( statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, write_options: Optional[Dict[Any, Any]] = None, spine: Optional[SplineDataFrameTypes] = None, - primary_keys: bool = False, + primary_key: bool = False, event_time: bool = False, training_helper_columns: bool = False, - ) -> Tuple[int, "job.Job"]: + **kwargs, + ) -> Tuple[int, job.Job]: """Create the metadata for a training dataset and save the corresponding training data into `location`. The training data is split into train, validation, and test set at random or according to time range. The training data can be retrieved by calling `feature_view.get_train_validation_test_split`. 
@@ -1906,7 +1910,7 @@ def create_train_validation_test_split( It is possible to directly pass a spine group instead of a dataframe to overwrite the left side of the feature join, however, the same features as in the original feature group that is being replaced need to be available in the spine group. - primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key + primary_key: whether to include primary key features or not. Defaults to `False`, no primary key features. event_time: whether to include event time feature or not. Defaults to `False`, no event time feature. training_helper_columns: whether to include training helper columns or not. @@ -1958,7 +1962,7 @@ def create_train_validation_test_split( td, write_options or {}, spine=spine, - primary_keys=primary_keys, + primary_keys=kwargs.get("primary_keys") or primary_key, event_time=event_time, training_helper_columns=training_helper_columns, ) @@ -1978,7 +1982,7 @@ def recreate_training_dataset( statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, write_options: Optional[Dict[Any, Any]] = None, spine: Optional[SplineDataFrameTypes] = None, - ) -> "job.Job": + ) -> job.Job: """ Recreate a training dataset. @@ -2056,10 +2060,11 @@ def training_data( statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, read_options: Optional[Dict[Any, Any]] = None, spine: Optional[SplineDataFrameTypes] = None, - primary_keys: bool = False, + primary_key: bool = False, event_time: bool = False, training_helper_columns: bool = False, dataframe_type: Optional[str] = "default", + **kwargs, ) -> Tuple[ TrainingDatasetDataFrameTypes, Optional[TrainingDatasetDataFrameTypes], # optional label DataFrame @@ -2145,7 +2150,7 @@ def training_data( It is possible to directly pass a spine group instead of a dataframe to overwrite the left side of the feature join, however, the same features as in the original feature group that is being replaced need to be available in the spine group. - primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key + primary_key: whether to include primary key features or not. Defaults to `False`, no primary key features. event_time: whether to include event time feature or not. Defaults to `False`, no event time feature. training_helper_columns: whether to include training helper columns or not. @@ -2180,7 +2185,7 @@ def training_data( read_options, training_dataset_obj=td, spine=spine, - primary_keys=primary_keys, + primary_keys=kwargs.get("primary_keys") or primary_key, event_time=event_time, training_helper_columns=training_helper_columns, dataframe_type=dataframe_type, @@ -2206,10 +2211,11 @@ def train_test_split( statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, read_options: Optional[Dict[Any, Any]] = None, spine: Optional[SplineDataFrameTypes] = None, - primary_keys: bool = False, + primary_key: bool = False, event_time: bool = False, training_helper_columns: bool = False, dataframe_type: Optional[str] = "default", + **kwargs, ) -> Tuple[ TrainingDatasetDataFrameTypes, TrainingDatasetDataFrameTypes, @@ -2307,7 +2313,7 @@ def train_test_split( It is possible to directly pass a spine group instead of a dataframe to overwrite the left side of the feature join, however, the same features as in the original feature group that is being replaced need to be available in the spine group. - primary_keys: whether to include primary key features or not. 
Defaults to `False`, no primary key + primary_key: whether to include primary key features or not. Defaults to `False`, no primary key features. event_time: whether to include event time feature or not. Defaults to `False`, no event time feature. training_helper_columns: whether to include training helper columns or not. @@ -2351,7 +2357,7 @@ def train_test_split( training_dataset_obj=td, splits=[TrainingDatasetSplit.TRAIN, TrainingDatasetSplit.TEST], spine=spine, - primary_keys=primary_keys, + primary_keys=kwargs.get("primary_keys") or primary_key, event_time=event_time, training_helper_columns=training_helper_columns, dataframe_type=dataframe_type, @@ -2393,10 +2399,11 @@ def train_validation_test_split( statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None, read_options: Optional[Dict[Any, Any]] = None, spine: Optional[SplineDataFrameTypes] = None, - primary_keys: bool = False, + primary_key: bool = False, event_time: bool = False, training_helper_columns: bool = False, dataframe_type: Optional[str] = "default", + **kwargs, ) -> Tuple[ TrainingDatasetDataFrameTypes, TrainingDatasetDataFrameTypes, @@ -2509,7 +2516,7 @@ def train_validation_test_split( It is possible to directly pass a spine group instead of a dataframe to overwrite the left side of the feature join, however, the same features as in the original feature group that is being replaced need to be available in the spine group. - primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key + primary_key: whether to include primary key features or not. Defaults to `False`, no primary key features. event_time: whether to include event time feature or not. Defaults to `False`, no event time feature. training_helper_columns: whether to include training helper columns or not. @@ -2566,7 +2573,7 @@ def train_validation_test_split( TrainingDatasetSplit.TEST, ], spine=spine, - primary_keys=primary_keys, + primary_keys=kwargs.get("primary_keys") or primary_key, event_time=event_time, training_helper_columns=training_helper_columns, dataframe_type=dataframe_type, @@ -2605,10 +2612,11 @@ def get_training_data( self, training_dataset_version: int, read_options: Optional[Dict[str, Any]] = None, - primary_keys: bool = False, + primary_key: bool = False, event_time: bool = False, training_helper_columns: bool = False, dataframe_type: Optional[str] = "default", + **kwargs, ) -> Tuple[ TrainingDatasetDataFrameTypes, Optional[TrainingDatasetDataFrameTypes], @@ -2642,7 +2650,7 @@ def get_training_data( * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` Defaults to `{}`. - primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key + primary_key: whether to include primary key features or not. Defaults to `False`, no primary key features. event_time: whether to include event time feature or not. Defaults to `False`, no event time feature. training_helper_columns: whether to include training helper columns or not. 
@@ -2661,7 +2669,7 @@ def get_training_data( self, read_options, training_dataset_version=training_dataset_version, - primary_keys=primary_keys, + primary_keys=kwargs.get("primary_keys") or primary_key, event_time=event_time, training_helper_columns=training_helper_columns, dataframe_type=dataframe_type, @@ -2674,10 +2682,11 @@ def get_train_test_split( self, training_dataset_version: int, read_options: Optional[Dict[Any, Any]] = None, - primary_keys: bool = False, + primary_key: bool = False, event_time: bool = False, training_helper_columns: bool = False, dataframe_type: Optional[str] = "default", + **kwargs, ) -> Tuple[ TrainingDatasetDataFrameTypes, TrainingDatasetDataFrameTypes, @@ -2708,7 +2717,7 @@ def get_train_test_split( * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` Defaults to `{}`. - primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key + primary_key: whether to include primary key features or not. Defaults to `False`, no primary key features. event_time: whether to include event time feature or not. Defaults to `False`, no event time feature. training_helper_columns: whether to include training helper columns or not. @@ -2729,7 +2738,7 @@ def get_train_test_split( read_options, training_dataset_version=training_dataset_version, splits=[TrainingDatasetSplit.TRAIN, TrainingDatasetSplit.TEST], - primary_keys=primary_keys, + primary_keys=kwargs.get("primary_keys") or primary_key, event_time=event_time, training_helper_columns=training_helper_columns, dataframe_type=dataframe_type, @@ -2742,10 +2751,11 @@ def get_train_validation_test_split( self, training_dataset_version: int, read_options: Optional[Dict[str, Any]] = None, - primary_keys: bool = False, + primary_key: bool = False, event_time: bool = False, training_helper_columns: bool = False, dataframe_type: str = "default", + **kwargs, ) -> Tuple[ TrainingDatasetDataFrameTypes, TrainingDatasetDataFrameTypes, @@ -2778,7 +2788,7 @@ def get_train_validation_test_split( * key `"arrow_flight_config"` to pass a dictionary of arrow flight configurations. For example: `{"arrow_flight_config": {"timeout": 900}}` Defaults to `{}`. - primary_keys: whether to include primary key features or not. Defaults to `False`, no primary key + primary_key: whether to include primary key features or not. Defaults to `False`, no primary key features. event_time: whether to include event time feature or not. Defaults to `False`, no event time feature. training_helper_columns: whether to include training helper columns or not. 
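For callers, the renamed flag is used the same way on the read path; a hedged usage sketch, assuming a feature view and a training dataset version 1 with a train/test split already exist (placeholder objects in the style of the docstring examples):

```python
# get feature store and feature view instances
fs = ...
feature_view = fs.get_feature_view(...)

# new spelling: also return primary key and event time columns with the splits
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(
    training_dataset_version=1,
    primary_key=True,
    event_time=True,
)

# the legacy spelling keeps working through the **kwargs fallback
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(
    training_dataset_version=1,
    primary_keys=True,
)
```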
@@ -2803,7 +2813,7 @@ def get_train_validation_test_split( TrainingDatasetSplit.VALIDATION, TrainingDatasetSplit.TEST, ], - primary_keys=primary_keys, + primary_keys=kwargs.get("primary_keys") or primary_key, event_time=event_time, training_helper_columns=training_helper_columns, dataframe_type=dataframe_type, From 65a05cab29e60fd875abe5970a5d4086c6d6a065 Mon Sep 17 00:00:00 2001 From: kennethmhc Date: Fri, 5 Jul 2024 12:16:09 +0200 Subject: [PATCH 08/11] [FSTORE-1424] Feature logging (#1354) * feature logging * clean up * default write option * doc * fix * fix style * revert pd.StringDtype() * fix doc * fix doc * doc and copy df * get untransformed features * rename transformed_features * get untransformed features * return batch df * address comments * rename internal methods * fix style * add warning * fix circular import * set logging_enabled to True * fix style --- python/hsfs/core/feature_logging.py | 53 ++++++ python/hsfs/core/feature_view_api.py | 103 +++++++++++ python/hsfs/core/feature_view_engine.py | 154 +++++++++++++++- python/hsfs/core/vector_server.py | 33 +++- python/hsfs/engine/python.py | 55 ++++++ python/hsfs/engine/spark.py | 6 + python/hsfs/feature.py | 28 ++- python/hsfs/feature_group.py | 1 + python/hsfs/feature_store.py | 6 + python/hsfs/feature_view.py | 233 ++++++++++++++++++++++++ 10 files changed, 659 insertions(+), 13 deletions(-) create mode 100644 python/hsfs/core/feature_logging.py diff --git a/python/hsfs/core/feature_logging.py b/python/hsfs/core/feature_logging.py new file mode 100644 index 0000000000..b29a7317dd --- /dev/null +++ b/python/hsfs/core/feature_logging.py @@ -0,0 +1,53 @@ +import json +from typing import Any, Dict + +import humps +from hsfs import feature_group, util + + +class FeatureLogging: + + def __init__(self, id: int, + transformed_features: "feature_group.FeatureGroup", + untransformed_features: "feature_group.FeatureGroup"): + self._id = id + self._transformed_features = transformed_features + self._untransformed_features = untransformed_features + + @classmethod + def from_response_json(cls, json_dict: Dict[str, Any]) -> 'FeatureLogging': + from hsfs.feature_group import FeatureGroup # avoid circular import + json_decamelized = humps.decamelize(json_dict) + transformed_features = json_decamelized.get('transformed_log') + untransformed_features = json_decamelized.get('untransformed_log') + if transformed_features: + transformed_features = FeatureGroup.from_response_json(transformed_features) + if untransformed_features: + untransformed_features = FeatureGroup.from_response_json(untransformed_features) + return cls(json_decamelized.get('id'), transformed_features, untransformed_features) + + @property + def transformed_features(self) -> "feature_group.FeatureGroup": + return self._transformed_features + + @property + def untransformed_features(self) -> "feature_group.FeatureGroup": + return self._untransformed_features + + @property + def id(self) -> str: + return self._id + + def to_dict(self): + return { + 'id': self._id, + 'transformed_log': self._transformed_features, + 'untransformed_log': self._untransformed_features, + } + + def json(self) -> Dict[str, Any]: + return json.dumps(self, cls=util.FeatureStoreEncoder) + + def __repr__(self): + return self.json() + diff --git a/python/hsfs/core/feature_view_api.py b/python/hsfs/core/feature_view_api.py index ed5a8468c3..732160748d 100644 --- a/python/hsfs/core/feature_view_api.py +++ b/python/hsfs/core/feature_view_api.py @@ -26,6 +26,7 @@ from hsfs.client.exceptions import 
RestAPIError from hsfs.constructor import query, serving_prepared_statement from hsfs.core import explicit_provenance, job, training_dataset_job_conf +from hsfs.core.job import Job class FeatureViewApi: @@ -43,6 +44,13 @@ class FeatureViewApi: _COMPUTE = "compute" _PROVENANCE = "provenance" _LINKS = "links" + _LOGGING = "log" + _PAUSE_LOGGING = "pause" + _RESUME_LOGGING = "resume" + _MATERIALIZE_LOGGING = "materialize" + _TRANSFORMED_lOG = "transformed" + _UNTRANSFORMED_LOG = "untransformed" + def __init__(self, feature_store_id: int) -> None: self._feature_store_id = feature_store_id @@ -358,3 +366,98 @@ def get_models_provenance( explicit_provenance.Links.Type.MODEL, training_dataset_version=training_dataset_version, ) + + def enable_feature_logging( + self, + feature_view_name: str, + feature_view_version: int,): + _client = client.get_instance() + path_params = self._base_path + [ + feature_view_name, + self._VERSION, + feature_view_version, + self._LOGGING, + ] + _client._send_request("PUT", path_params, {}) + + def pause_feature_logging( + self, + feature_view_name: str, + feature_view_version: int,): + _client = client.get_instance() + path_params = self._base_path + [ + feature_view_name, + self._VERSION, + feature_view_version, + self._LOGGING, + self._PAUSE_LOGGING, + ] + return _client._send_request("POST", path_params, {}) + + def resume_feature_logging( + self, + feature_view_name: str, + feature_view_version: int,): + _client = client.get_instance() + path_params = self._base_path + [ + feature_view_name, + self._VERSION, + feature_view_version, + self._LOGGING, + self._RESUME_LOGGING, + ] + return _client._send_request("POST", path_params, {}) + + def materialize_feature_logging( + self, + feature_view_name: str, + feature_view_version: int,): + _client = client.get_instance() + path_params = self._base_path + [ + feature_view_name, + self._VERSION, + feature_view_version, + self._LOGGING, + self._MATERIALIZE_LOGGING, + ] + jobs_json = _client._send_request("POST", path_params, {}) + jobs = [] + if jobs_json.get("count", 0) > 1: + for item in jobs_json["items"]: + jobs.append(Job.from_response_json(item)) + else: + jobs.append(Job.from_response_json(jobs_json)) + return jobs + + def get_feature_logging( + self, + feature_view_name: str, + feature_view_version: int,): + _client = client.get_instance() + path_params = self._base_path + [ + feature_view_name, + self._VERSION, + feature_view_version, + self._LOGGING, + ] + return _client._send_request("GET", path_params, {}) + + def delete_feature_logs( + self, + feature_view_name: str, + feature_view_version: int, + transformed: bool = None, + ): + _client = client.get_instance() + path_params = self._base_path + [ + feature_view_name, + self._VERSION, + feature_view_version, + self._LOGGING, + ] + if transformed is not None: + if transformed: + path_params += [self._TRANSFORMED_lOG] + else: + path_params += [self._UNTRANSFORMED_LOG] + _client._send_request("DELETE", path_params, {}) diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index dd49fa5e21..9926a6fc01 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -17,7 +17,7 @@ import datetime import warnings -from typing import Optional +from typing import Dict, Optional, Union from hsfs import ( client, @@ -29,6 +29,7 @@ ) from hsfs.client import exceptions from hsfs.client.exceptions import FeatureStoreException +from hsfs.constructor.filter import Filter, Logic from hsfs.core 
import ( arrow_flight_client, code_engine, @@ -39,6 +40,7 @@ training_dataset_engine, transformation_function_engine, ) +from hsfs.core.feature_logging import FeatureLogging from hsfs.training_dataset_split import TrainingDatasetSplit @@ -48,6 +50,10 @@ class FeatureViewEngine: _OVERWRITE = "overwrite" _APPEND = "append" + _LOG_TD_VERSION = "td_version" + _LOG_TIME = "log_time" + _HSML_MODEL = "hsml_model" + def __init__(self, feature_store_id): self._feature_store_id = feature_store_id @@ -742,6 +748,7 @@ def get_batch_data( event_time=False, inference_helper_columns=False, dataframe_type="default", + transformed=True, ): self._check_feature_group_accessibility(feature_view_obj) @@ -758,18 +765,23 @@ def get_batch_data( with_label=False, primary_keys=primary_keys, event_time=event_time, - inference_helper_columns=inference_helper_columns, + inference_helper_columns=inference_helper_columns or transformed, training_helper_columns=False, training_dataset_version=training_dataset_version, spine=spine, ).read(read_options=read_options, dataframe_type=dataframe_type) - if transformation_functions: + if transformation_functions and transformed: return engine.get_instance()._apply_transformation_function( transformation_functions, dataset=feature_dataframe ) else: return feature_dataframe + def transform_batch_data(self, features, transformation_functions): + return engine.get_instance()._apply_transformation_function( + transformation_functions, dataset=features, inplace=False + ) + def add_tag( self, feature_view_obj, name: str, value, training_dataset_version=None ): @@ -923,3 +935,139 @@ def _check_if_exists_with_prefix(self, f_name, f_set): ) else: return f_name + + def enable_feature_logging(self, fv): + self._feature_view_api.enable_feature_logging(fv.name, fv.version) + fv.logging_enabled = True + return fv + + def get_feature_logging(self, fv): + return FeatureLogging.from_response_json( + self._feature_view_api.get_feature_logging(fv.name, fv.version) + ) + + def _get_logging_fg(self, fv, transformed): + feature_logging = self.get_feature_logging(fv) + if transformed: + return feature_logging.transformed_features + else: + return feature_logging.untransformed_features + + def log_features(self, fv, features, prediction=None, transformed=False, write_options=None, training_dataset_version=None, hsml_model=None): + default_write_options = { + "start_offline_materialization": False, + } + if write_options: + default_write_options.update(write_options) + fg = self._get_logging_fg(fv, transformed) + df = engine.get_instance().get_feature_logging_df( + fg, + features, + [feature for feature in fv.features if not feature.label], + [feature for feature in fv.features if feature.label], + FeatureViewEngine._LOG_TD_VERSION, + FeatureViewEngine._LOG_TIME, + FeatureViewEngine._HSML_MODEL, + prediction, + training_dataset_version, + hsml_model, + ) + return fg.insert(df, write_options=default_write_options) + + def read_feature_logs(self, fv, + start_time: Optional[ + Union[str, int, datetime, datetime.date]] = None, + end_time: Optional[ + Union[str, int, datetime, datetime.date]] = None, + filter: Optional[Union[Filter, Logic]]=None, + transformed: Optional[bool]=False, + training_dataset_version=None, + hsml_model=None, + ): + fg = self._get_logging_fg(fv, transformed) + fv_feat_name_map = self._get_fv_feature_name_map(fv) + query = fg.select_all() + if start_time: + query = query.filter(fg.get_feature(FeatureViewEngine._LOG_TIME) >= start_time) + if end_time: + query = 
query.filter(fg.get_feature(FeatureViewEngine._LOG_TIME) <= end_time) + if training_dataset_version: + query = query.filter(fg.get_feature(FeatureViewEngine._LOG_TD_VERSION) == training_dataset_version) + if hsml_model: + query = query.filter(fg.get_feature(FeatureViewEngine._HSML_MODEL) == self.get_hsml_model_value(hsml_model)) + if filter: + query = query.filter(self._convert_to_log_fg_filter(fg, fv, filter, fv_feat_name_map)) + df = query.read() + df = df.drop(["log_id", FeatureViewEngine._LOG_TIME], axis=1) + return df + + @staticmethod + def get_hsml_model_value(hsml_model): + return f"{hsml_model.name}_{hsml_model.version}" + + def _convert_to_log_fg_filter(self, fg, fv, filter, fv_feat_name_map): + if filter is None: + return None + + if isinstance(filter, Logic): + return Logic( + filter.type, + left_f=self._convert_to_log_fg_filter(fv, filter.left_f), + right_f=self._convert_to_log_fg_filter(fv, filter.right_f), + left_l=self._convert_to_log_fg_filter(fv, filter.left_l), + right_l=self._convert_to_log_fg_filter(fv, filter.right_l), + ) + elif isinstance(filter, Filter): + fv_feature_name = fv_feat_name_map.get( + f"{filter.feature.feature_group_id}_{filter.feature.name}") + if fv_feature_name is None: + raise FeatureStoreException("Filter feature {filter.feature.name} does not exist in feature view feature.") + return Filter( + fg.get_feature(filter.feature.name), + filter.condition, + filter.value, + ) + else: + raise FeatureStoreException("Accept only Filter or Logic") + + def _get_fv_feature_name_map(self, fv) -> Dict[str, str]: + result_dict = {} + for td_feature in fv.features: + fg_feature_key = f"{td_feature.feature_group.id}_{td_feature.feature_group_feature_name}" + result_dict[fg_feature_key] = td_feature.name + return result_dict + + def get_log_timeline(self, fv, + wallclock_time: Optional[ + Union[str, int, datetime, datetime.date]] = None, + limit: Optional[int] = None, + transformed: Optional[bool]=False, + ) -> Dict[str, Dict[str, str]]: + fg = self._get_logging_fg(fv, transformed) + return fg.commit_details(wallclock_time=wallclock_time, limit=limit) + + def pause_logging(self, fv): + self._feature_view_api.pause_feature_logging( + fv.name, fv.version + ) + def resume_logging(self, fv): + self._feature_view_api.resume_feature_logging( + fv.name, fv.version + ) + + def materialize_feature_logs(self, fv, wait): + jobs = self._feature_view_api.materialize_feature_logging( + fv.name, fv.version + ) + if wait: + for job in jobs: + try: + job._wait_for_job(wait) + except Exception: + pass + return jobs + + def delete_feature_logs(self, fv, transformed): + self._feature_view_api.delete_feature_logs( + fv.name, fv.version, transformed + ) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 9eca5dd3cd..18e6211a73 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -100,6 +100,14 @@ def __init__( or feat.training_helper_column ) ] + self._untransformed_feature_vector_col_name = [ + feat.name + for feat in features + if not ( + feat.label + or feat.training_helper_column + ) + ] self._inference_helper_col_name = [ feat.name for feat in features if feat.inference_helper_column ] @@ -250,6 +258,7 @@ def get_feature_vector( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + transformed=True, ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[Any], Dict[str, Any]]: """Assembles serving vector from online feature store.""" online_client_choice = 
self.which_client_and_ensure_initialised( @@ -281,10 +290,11 @@ def get_feature_vector( vector_db_result=vector_db_features or {}, allow_missing=allow_missing, client=online_client_choice, + transformed=transformed, ) return self.handle_feature_vector_return_type( - vector, batch=False, inference_helper=False, return_type=return_type + vector, batch=False, inference_helper=transformed, return_type=return_type ) def get_feature_vectors( @@ -298,6 +308,7 @@ def get_feature_vectors( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + transformed=True, ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[Any], List[Dict[str, Any]]]: """Assembles serving vector from online feature store.""" if passed_features is None: @@ -383,13 +394,14 @@ def get_feature_vectors( vector_db_result=vector_db_result, allow_missing=allow_missing, client=online_client_choice, + transformed=transformed, ) if vector is not None: vectors.append(vector) return self.handle_feature_vector_return_type( - vectors, batch=True, inference_helper=False, return_type=return_type + vectors, batch=True, inference_helper=transformed, return_type=return_type ) def assemble_feature_vector( @@ -399,6 +411,7 @@ def assemble_feature_vector( vector_db_result: Optional[Dict[str, Any]], allow_missing: bool, client: Literal["rest", "sql"], + transformed, ) -> Optional[List[Any]]: """Assembles serving vector from online feature store.""" # Errors in batch requests are returned as None values @@ -435,12 +448,24 @@ def assemble_feature_vector( if len(self.return_feature_value_handlers) > 0: self.apply_return_value_handlers(result_dict, client=client) - if len(self.transformation_functions) > 0: + if len(self.transformation_functions) > 0 and transformed: self.apply_transformation(result_dict) _logger.debug("Assembled and transformed dict feature vector: %s", result_dict) + if transformed: + return [result_dict.get(fname, None) for fname in self.feature_vector_col_name] + else: + return [result_dict.get(fname, None) for fname in self._untransformed_feature_vector_col_name] + + def transform_feature_vectors(self, batch_features): + return [self.apply_transformation(self.get_untransformed_features_map(features)) + for features in batch_features + ] - return [result_dict.get(fname, None) for fname in self.feature_vector_col_name] + def get_untransformed_features_map(self, features) -> Dict[str, Any]: + return dict( + [(fname, fvalue) for fname, fvalue + in zip(self._untransformed_feature_vector_col_name, features)]) def handle_feature_vector_return_type( self, diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index dd966b366e..4e094a377e 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -82,10 +82,12 @@ transformation_function_engine, ) from hsfs.core.constants import HAS_AIOMYSQL, HAS_GREAT_EXPECTATIONS, HAS_SQLALCHEMY +from hsfs.core.feature_view_engine import FeatureViewEngine from hsfs.core.vector_db_client import VectorDbClient from hsfs.decorators import uses_great_expectations from hsfs.feature_group import ExternalFeatureGroup, FeatureGroup from hsfs.training_dataset import TrainingDataset +from hsfs.training_dataset_feature import TrainingDatasetFeature from hsfs.training_dataset_split import TrainingDatasetSplit from sqlalchemy import sql from tqdm.auto import tqdm @@ -1198,6 +1200,7 @@ def _apply_transformation_function( str, transformation_function_attached.TransformationFunctionAttached ], dataset: Union[pd.DataFrame, pl.DataFrame], + 
inplace=True, ) -> Union[pd.DataFrame, pl.DataFrame]: for ( feature_name, @@ -1212,6 +1215,7 @@ def _apply_transformation_function( ) ) else: + dataset = pd.DataFrame.copy(dataset) dataset[feature_name] = dataset[feature_name].map( transformation_fn.transformation_fn ) @@ -1754,3 +1758,54 @@ def _start_offline_materialization(offline_write_options: Dict[str, Any]) -> boo return True else: return True + + @staticmethod + def _convert_feature_log_to_df(feature_log, cols): + if feature_log is None and cols: + return pd.DataFrame(columns=cols) + if not (isinstance(feature_log, (list, np.ndarray, pd.DataFrame, pl.DataFrame))): + raise ValueError(f"Type '{type(feature_log)}' not accepted") + if isinstance(feature_log, list) or isinstance(feature_log, np.ndarray): + if isinstance(feature_log[0], list) or isinstance(feature_log[0], + np.ndarray): + provided_len = len(feature_log[0]) + else: + provided_len = 1 + assert provided_len == len(cols), f"Expecting {len(cols)} features/labels but {provided_len} provided." + + return pd.DataFrame(feature_log, columns=cols) + else: + return feature_log.copy(deep=False) + + @staticmethod + def get_feature_logging_df(fg, + features, + fg_features: List[TrainingDatasetFeature], + td_predictions: List[TrainingDatasetFeature], + td_col_name, + time_col_name, + model_col_name, + prediction=None, + training_dataset_version=None, + hsml_model=None, + ) -> pd.DataFrame: + import uuid + features = Engine._convert_feature_log_to_df(features, [f.name for f in fg_features]) + if td_predictions: + prediction = Engine._convert_feature_log_to_df(prediction, [f.name for f in td_predictions]) + for f in td_predictions: + prediction[f.name] = Engine._cast_column_to_offline_type(prediction[f.name], f.type) + if not set(prediction.columns).intersection(set(features.columns)): + features = pd.concat([features, prediction], axis=1) + # need to case the column type as if it is None, type cannot be inferred. 
+ features[td_col_name] = Engine._cast_column_to_offline_type( + pd.Series([training_dataset_version for _ in range(len(features))]), fg.get_feature(td_col_name).type + ) + # _cast_column_to_offline_type cannot cast string type + features[model_col_name] = pd.Series([FeatureViewEngine.get_hsml_model_value(hsml_model) if hsml_model else None for _ in range(len(features))], dtype=pd.StringDtype()) + now = datetime.now() + features[time_col_name] = Engine._cast_column_to_offline_type( + pd.Series([now for _ in range(len(features))]), fg.get_feature(time_col_name).type + ) + features["log_id"] = [str(uuid.uuid4()) for _ in range(len(features))] + return features[[feat.name for feat in fg.features]] diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 9142d64835..4a48b80e08 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -1410,6 +1410,12 @@ def _get_kafka_config( def is_connector_type_supported(type): return True + @staticmethod + def get_feature_logging_df(features, prediction=None): + # do not take prediction separately because spark ml framework usually return feature together with the prediction + # and it is costly to join them back + return features + class SchemaError(Exception): """Thrown when schemas don't match""" diff --git a/python/hsfs/feature.py b/python/hsfs/feature.py index 89f19b060d..8969805671 100644 --- a/python/hsfs/feature.py +++ b/python/hsfs/feature.py @@ -16,6 +16,7 @@ from __future__ import annotations import json +from datetime import datetime from typing import Any, Dict, List, Optional, Union import hsfs @@ -206,23 +207,38 @@ def default_value(self, default_value: Optional[str]) -> None: def feature_group_id(self) -> Optional[int]: return self._feature_group_id + def _get_filter_value(self, value: Any) -> Any: + if self.type == "timestamp": + return (datetime.fromtimestamp( + util.convert_event_time_to_timestamp(value)/1000) + .strftime("%Y-%m-%d %H:%M:%S") + ) + else: + return value + def __lt__(self, other: Any) -> "filter.Filter": - return filter.Filter(self, filter.Filter.LT, other) + return filter.Filter(self, filter.Filter.LT, + self._get_filter_value(other)) def __le__(self, other: Any) -> "filter.Filter": - return filter.Filter(self, filter.Filter.LE, other) + return filter.Filter(self, filter.Filter.LE, + self._get_filter_value(other)) def __eq__(self, other: Any) -> "filter.Filter": - return filter.Filter(self, filter.Filter.EQ, other) + return filter.Filter(self, filter.Filter.EQ, + self._get_filter_value(other)) def __ne__(self, other: Any) -> "filter.Filter": - return filter.Filter(self, filter.Filter.NE, other) + return filter.Filter(self, filter.Filter.NE, + self._get_filter_value(other)) def __ge__(self, other: Any) -> "filter.Filter": - return filter.Filter(self, filter.Filter.GE, other) + return filter.Filter(self, filter.Filter.GE, + self._get_filter_value(other)) def __gt__(self, other: Any) -> "filter.Filter": - return filter.Filter(self, filter.Filter.GT, other) + return filter.Filter(self, filter.Filter.GT, + self._get_filter_value(other)) def contains(self, other: Union[str, List[Any]]) -> "filter.Filter": """ diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index b257b8edd4..605c2950e7 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -140,6 +140,7 @@ def __init__( self._notification_topic_name = notification_topic_name self._deprecated = deprecated self._feature_store_id = featurestore_id + self._feature_store = None 
self._variable_api: VariableApi = VariableApi() self._multi_part_insert: bool = False diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 5a14a7c514..7d65bd9739 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -1474,6 +1474,7 @@ def create_feature_view( inference_helper_columns: Optional[List[str]] = None, training_helper_columns: Optional[List[str]] = None, transformation_functions: Optional[Dict[str, TransformationFunction]] = None, + logging_enabled: Optional[bool] = False, ) -> feature_view.FeatureView: """Create a feature view metadata object and saved it to hopsworks. @@ -1561,6 +1562,7 @@ def create_feature_view( to the features they should be applied to before writing out the vector and at inference time. Defaults to `{}`, no transformations. + logging_enabled: If true, enable feature logging for the feature view. # Returns: `FeatureView`: The feature view metadata object. @@ -1576,6 +1578,7 @@ def create_feature_view( training_helper_columns=training_helper_columns or [], transformation_functions=transformation_functions or {}, featurestore_name=self._name, + logging_enabled=logging_enabled, ) return self._feature_view_engine.save(feat_view) @@ -1590,6 +1593,7 @@ def get_or_create_feature_view( inference_helper_columns: Optional[List[str]] = None, training_helper_columns: Optional[List[str]] = None, transformation_functions: Optional[Dict[str, TransformationFunction]] = None, + logging_enabled: Optional[bool] = False, ) -> feature_view.FeatureView: """Get feature view metadata object or create a new one if it doesn't exist. This method doesn't update existing feature view metadata object. @@ -1639,6 +1643,7 @@ def get_or_create_feature_view( to the features they should be applied to before writing out the vector and at inference time. Defaults to `{}`, no transformations. + logging_enabled: If true, enable feature logging for the feature view. # Returns: `FeatureView`: The feature view metadata object. @@ -1659,6 +1664,7 @@ def get_or_create_feature_view( inference_helper_columns=inference_helper_columns or [], training_helper_columns=training_helper_columns or [], transformation_functions=transformation_functions or {}, + logging_enabled=logging_enabled, ) else: raise e diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index f78e9a7972..20e7237465 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -102,6 +102,7 @@ def __init__( ] = None, featurestore_name: Optional[str] = None, serving_keys: Optional[List[skm.ServingKey]] = None, + logging_enabled: Optional[bool] = False, **kwargs, ) -> None: self._name = name @@ -144,6 +145,7 @@ def __init__( self._statistics_engine = statistics_engine.StatisticsEngine( featurestore_id, self.ENTITY_TYPE ) + self._logging_enabled = logging_enabled if self._id: self._init_feature_monitoring_engine() @@ -463,6 +465,7 @@ def get_feature_vector( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + transformed: Optional[bool] = True, ) -> Union[List[Any], pd.DataFrame, np.ndarray, pl.DataFrame]: """Returns assembled feature vector from online feature store. Call [`feature_view.init_serving`](#init_serving) before this method if the following configurations are needed. @@ -536,6 +539,7 @@ def get_feature_vector( force_sql_client: boolean, defaults to False. If set to True, reads from online feature store using the SQL client if initialised. 
allow_missing: Setting to `True` returns feature vectors with missing values. + transformed: Setting to `False` returns the untransformed feature vectors. # Returns `list`, `pd.DataFrame`, `polars.DataFrame` or `np.ndarray` if `return type` is set to `"list"`, `"pandas"`, `"polars"` or `"numpy"` @@ -561,6 +565,7 @@ def get_feature_vector( vector_db_features=vector_db_features, force_rest_client=force_rest_client, force_sql_client=force_sql_client, + transformed=transformed, ) def get_feature_vectors( @@ -572,6 +577,7 @@ def get_feature_vectors( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + transformed: Optional[bool] = True, ) -> Union[List[List[Any]], pd.DataFrame, np.ndarray, pl.DataFrame]: """Returns assembled feature vectors in batches from online feature store. Call [`feature_view.init_serving`](#init_serving) before this method if the following configurations are needed. @@ -643,6 +649,7 @@ def get_feature_vectors( force_rest_client: boolean, defaults to False. If set to True, reads from online feature store using the REST client if initialised. allow_missing: Setting to `True` returns feature vectors with missing values. + transformed: Setting to `False` returns the untransformed feature vectors. # Returns `List[list]`, `pd.DataFrame`, `polars.DataFrame` or `np.ndarray` if `return type` is set to `"list", `"pandas"`,`"polars"` or `"numpy"` @@ -670,6 +677,7 @@ def get_feature_vectors( vector_db_features=vector_db_features, force_rest_client=force_rest_client, force_sql_client=force_sql_client, + transformed=transformed, ) def get_inference_helper( @@ -921,6 +929,7 @@ def get_batch_data( event_time: bool = False, inference_helper_columns: bool = False, dataframe_type: Optional[str] = "default", + transformed: Optional[bool] = True, **kwargs, ) -> TrainingDatasetDataFrameTypes: """Get a batch of data from an event time interval from the offline feature store. @@ -976,6 +985,8 @@ def get_batch_data( dataframe_type: str, optional. The type of the returned dataframe. Possible values are `"default"`, `"spark"`,`"pandas"`, `"polars"`, `"numpy"` or `"python"`. Defaults to "default", which maps to Spark dataframe for the Spark Engine and Pandas dataframe for the Python engine. + transformed: Setting to `False` returns the untransformed feature vectors. + # Returns `DataFrame`: The spark dataframe containing the feature data. `pyspark.DataFrame`. A Spark DataFrame. 
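A hedged usage sketch of the new `transformed` switch introduced in this patch, assuming serving or batch scoring has been initialised and using placeholder objects in the style of the docstring examples:

```python
# get feature store and feature view instances
fs = ...
feature_view = fs.get_feature_view(...)

# read batch data without applying the attached transformation functions
raw_df = feature_view.get_batch_data(transformed=False)

# apply the transformation functions explicitly later, e.g. just before scoring
transformed_df = feature_view.transform_batch_data(raw_df)

# the online read path gets the same switch
untransformed_vector = feature_view.get_feature_vector(entry={...}, transformed=False)
```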
@@ -999,6 +1010,7 @@ def get_batch_data( event_time, inference_helper_columns, dataframe_type, + transformed=transformed, ) def add_tag(self, name: str, value: Any) -> None: @@ -3379,6 +3391,7 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView": description=json_decamelized.get("description", None), featurestore_name=json_decamelized.get("featurestore_name", None), serving_keys=serving_keys, + logging_enabled=json_decamelized.get('enabled_logging', False), ) features = json_decamelized.get("features", []) if features: @@ -3413,11 +3426,222 @@ def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureView": "training_helper_columns", "schema", "serving_keys", + "enabled_logging", ]: self._update_attribute_if_present(self, other, key) self._init_feature_monitoring_engine() return self + def transform_batch_data(self, features): + return self._feature_view_engine.transform_batch_data( + features, self._batch_scoring_server._transformation_functions + ) + + def transform_feature_vector(self, features): + return self.transform_feature_vectors([features]) + + def transform_feature_vectors(self, features): + return self._batch_scoring_server.transform_feature_vectors(features) + + def enable_logging(self) -> None: + """Enable feature logging for the current feature view. + + This method activates logging of features. + + # Example + ```python + # get feature store instance + fs = ... + + # get feature view instance + feature_view = fs.get_feature_view(...) + + # enable logging + feature_view.enable_logging() + ``` + + # Raises + `hsfs.client.exceptions.RestAPIError` in case the backend fails to enable feature logging. + """ + return self._feature_view_engine.enable_feature_logging(self) + + def log(self, + features: Union[pd.DataFrame, list[list], np.ndarray], + predictions: Optional[Union[pd.DataFrame, list[list], np.ndarray]]=None, + transformed: Optional[bool]=False, + write_options: Optional[Dict[str, Any]] = None, + training_dataset_version: Optional[int]=None, + hsml_model=None, + ): + """Log features and optionally predictions for the current feature view. + + Note: If features is a `pd.Dataframe`, prediction can be provided as columns in the dataframe. + + # Arguments + features: The features to be logged. Can be a pandas DataFrame, a list of lists, or a numpy ndarray. + prediction: The predictions to be logged. Can be a pandas DataFrame, a list of lists, or a numpy ndarray. Defaults to None. + transformed: Whether the features are transformed. Defaults to False. + write_options: Options for writing the log. Defaults to None. + training_dataset_version: Version of the training dataset. Defaults to None. + hsml_model: `hsml.model.Model` HSML model associated with the log. Defaults to None. + + # Example + ```python + # log features + feature_view.log(features) + # log features and predictions + feature_view.log(features, prediction) + ``` + + # Raises + `hsfs.client.exceptions.RestAPIError` in case the backend fails to log features. + """ + if not self.logging_enabled: + warnings.warn( + "Feature logging is not enabled. It may take longer to enable logging before logging the features. 
You can call `feature_view.enable_logging()` to enable logging beforehand.", + stacklevel=1, + ) + self.enable_logging() + return self._feature_view_engine.log_features( + self, features, predictions, transformed, + write_options, + training_dataset_version=( + training_dataset_version or self.get_last_accessed_training_dataset() + ), + hsml_model=hsml_model + ) + + def get_log_timeline(self, + wallclock_time: Optional[ + Union[str, int, datetime, datetime.date]] = None, + limit: Optional[int] = None, + transformed: Optional[bool] = False, + ): + """Retrieve the log timeline for the current feature view. + + # Arguments + wallclock_time: Specific time to get the log timeline for. Can be a string, integer, datetime, or date. Defaults to None. + limit: Maximum number of entries to retrieve. Defaults to None. + transformed: Whether to include transformed logs. Defaults to False. + + # Example + ```python + # get log timeline + log_timeline = feature_view.get_log_timeline(limit=10) + ``` + + # Raises + `hsfs.client.exceptions.RestAPIError` in case the backend fails to retrieve the log timeline. + """ + return self._feature_view_engine.get_log_timeline( + self, wallclock_time, limit, transformed + ) + + def read_log(self, + start_time: Optional[ + Union[str, int, datetime, datetime.date]] = None, + end_time: Optional[ + Union[str, int, datetime, datetime.date]] = None, + filter: Optional[Union[Filter, Logic]] = None, + transformed: Optional[bool] = False, + training_dataset_version: Optional[int]=None, + hsml_model=None, + ): + """Read the log entries for the current feature view. + Optionally, filter can be applied to start/end time, training dataset version, hsml model, + and custom fitler. + + # Arguments + start_time: Start time for the log entries. Can be a string, integer, datetime, or date. Defaults to None. + end_time: End time for the log entries. Can be a string, integer, datetime, or date. Defaults to None. + filter: Filter to apply on the log entries. Can be a Filter or Logic object. Defaults to None. + transformed: Whether to include transformed logs. Defaults to False. + training_dataset_version: Version of the training dataset. Defaults to None. + hsml_model: HSML model associated with the log. Defaults to None. + + # Example + ```python + # read all log entries + log_entries = feature_view.read_log() + # read log entries within time ranges + log_entries = feature_view.read_log(start_time="2022-01-01", end_time="2022-01-31") + # read log entries of a specific training dataset version + log_entries = feature_view.read_log(training_dataset_version=1) + # read log entries of a specific hsml model + log_entries = feature_view.read_log(hsml_model=Model(1, "dummy", version=1)) + # read log entries by applying filter on features of feature group `fg` in the feature view + log_entries = feature_view.read_log(filter=fg.feature1 > 10) + ``` + + # Raises + `hsfs.client.exceptions.RestAPIError` in case the backend fails to read the log entries. + """ + return self._feature_view_engine.read_feature_logs( + self, start_time, end_time, filter, transformed, training_dataset_version, hsml_model + ) + + def pause_logging(self): + """Pause scheduled materialization job for the current feature view. + + # Example + ```python + # pause logging + feature_view.pause_logging() + ``` + + # Raises + `hsfs.client.exceptions.RestAPIError` in case the backend fails to pause feature logging. 
+ """ + self._feature_view_engine.pause_logging(self) + + def resume_logging(self): + """Resume scheduled materialization job for the current feature view. + + # Example + ```python + # resume logging + feature_view.resume_logging() + ``` + + # Raises + `hsfs.client.exceptions.RestAPIError` in case the backend fails to pause feature logging. + """ + self._feature_view_engine.resume_logging(self) + + def materialize_log(self, wait: Optional[bool]=False): + """Materialize the log for the current feature view. + + # Arguments + wait: Whether to wait for the materialization to complete. Defaults to False. + + # Example + ```python + # materialize log + materialization_result = feature_view.materialize_log(wait=True) + ``` + + # Raises + `hsfs.client.exceptions.RestAPIError` in case the backend fails to materialize the log. + """ + return self._feature_view_engine.materialize_feature_logs(self, wait) + + def delete_log(self, transformed: Optional[bool]=None): + """Delete the logged feature data for the current feature view. + + # Arguments + transformed: Whether to delete transformed logs. Defaults to None. Delete both transformed and untransformed logs. + + # Example + ```python + # delete log + feature_view.delete_log() + ``` + + # Raises + `hsfs.client.exceptions.RestAPIError` in case the backend fails to delete the log. + """ + return self._feature_view_engine.delete_feature_logs(self, transformed) + @staticmethod def _update_attribute_if_present(this: "FeatureView", new: Any, key: str) -> None: if getattr(new, key): @@ -3450,6 +3674,7 @@ def to_dict(self) -> Dict[str, Any]: "description": self._description, "query": self._query, "features": self._features, + "enabledLogging": self._logging_enabled, "type": "featureViewDTO", } @@ -3619,3 +3844,11 @@ def serving_keys(self) -> List[skm.ServingKey]: @serving_keys.setter def serving_keys(self, serving_keys: List[skm.ServingKey]) -> None: self._serving_keys = serving_keys + + @property + def logging_enabled(self) -> bool: + return self._logging_enabled + + @logging_enabled.setter + def logging_enabled(self, logging_enabled) -> None: + self._logging_enabled = logging_enabled From 834d13ff1c0b21c6b852ea4fc60397901124cd01 Mon Sep 17 00:00:00 2001 From: Victor Jouffrey <37411285+vatj@users.noreply.github.com> Date: Mon, 8 Jul 2024 12:48:58 +0200 Subject: [PATCH 09/11] [FSTORE-1461] Refactor Kafka out of python engine (#1359) * Start Kafka refactor * Continue refactor of kafka/avro out of python engine * Minor tidy up of args * Tidy up python engine * Minor refactoring of multi_part_insert and kafka headers * Options to get_kafka_config with spark options * Fix mocking * Move and fix kafka tests * More fixing of mocking * More work on unit test fixing * Fix with import reload * Fix mocking in test_feature_group_writer * Fix mocking in engine/test_python --------- Co-authored-by: Aleksey Veresov --- python/hsfs/core/constants.py | 13 + python/hsfs/core/kafka_engine.py | 248 ++++++++++ python/hsfs/engine/python.py | 254 ++--------- python/hsfs/engine/spark.py | 62 ++- python/hsfs/feature_group.py | 2 + python/tests/core/test_kafka_engine.py | 527 ++++++++++++++++++++++ python/tests/engine/test_python.py | 505 ++------------------- python/tests/engine/test_python_writer.py | 14 +- python/tests/engine/test_spark.py | 184 -------- python/tests/test_feature_group_writer.py | 26 +- 10 files changed, 909 insertions(+), 926 deletions(-) create mode 100644 python/hsfs/core/kafka_engine.py create mode 100644 python/tests/core/test_kafka_engine.py diff --git 
a/python/hsfs/core/constants.py b/python/hsfs/core/constants.py index c2cbd33a8b..d6af380185 100644 --- a/python/hsfs/core/constants.py +++ b/python/hsfs/core/constants.py @@ -1,6 +1,19 @@ import importlib.util +# Avro +HAS_FAST_AVRO: bool = importlib.util.find_spec("fastavro") is not None +HAS_AVRO: bool = importlib.util.find_spec("avro") is not None + +# Confluent Kafka +HAS_CONFLUENT_KAFKA: bool = importlib.util.find_spec("confluent_kafka") is not None +confluent_kafka_not_installed_message = ( + "Confluent Kafka package not found. " + "If you want to use Kafka with Hopsworks you can install the corresponding extras " + """`pip install hopsworks[python]` or `pip install "hopsworks[python]"` if using zsh. """ + "You can also install confluent-kafka directly in your environment e.g `pip install confluent-kafka`. " + "You will need to restart your kernel if applicable." +) # Data Validation / Great Expectations HAS_GREAT_EXPECTATIONS: bool = ( importlib.util.find_spec("great_expectations") is not None diff --git a/python/hsfs/core/kafka_engine.py b/python/hsfs/core/kafka_engine.py new file mode 100644 index 0000000000..ca1a50f0a5 --- /dev/null +++ b/python/hsfs/core/kafka_engine.py @@ -0,0 +1,248 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +import json +from io import BytesIO +from typing import TYPE_CHECKING, Any, Callable, Dict, Literal, Optional, Tuple, Union + +from hsfs import client +from hsfs.client import hopsworks +from hsfs.core import storage_connector_api +from hsfs.core.constants import HAS_AVRO, HAS_CONFLUENT_KAFKA, HAS_FAST_AVRO +from tqdm import tqdm + + +if HAS_CONFLUENT_KAFKA: + from confluent_kafka import Consumer, KafkaError, Producer, TopicPartition + +if HAS_FAST_AVRO: + from fastavro import schemaless_writer + from fastavro.schema import parse_schema +elif HAS_AVRO: + import avro.io + import avro.schema + + +if TYPE_CHECKING: + from hsfs.feature_group import ExternalFeatureGroup, FeatureGroup + + +def init_kafka_consumer( + feature_store_id: int, + offline_write_options: Dict[str, Any], +) -> Consumer: + # setup kafka consumer + consumer_config = get_kafka_config(feature_store_id, offline_write_options) + if "group.id" not in consumer_config: + consumer_config["group.id"] = "hsfs_consumer_group" + + return Consumer(consumer_config) + + +def init_kafka_resources( + feature_group: Union[FeatureGroup, ExternalFeatureGroup], + offline_write_options: Dict[str, Any], + project_id: int, +) -> Tuple[ + Producer, Dict[str, bytes], Dict[str, Callable[..., bytes]], Callable[..., bytes] : +]: + # this function is a caching wrapper around _init_kafka_resources + if feature_group._multi_part_insert and feature_group._kafka_producer: + return ( + feature_group._kafka_producer, + feature_group._kafka_headers, + feature_group._feature_writers, + feature_group._writer, + ) + producer, headers, feature_writers, writer = _init_kafka_resources( + feature_group, offline_write_options, project_id + ) + if feature_group._multi_part_insert: + feature_group._kafka_producer = producer + feature_group._kafka_headers = headers + feature_group._feature_writers = feature_writers + feature_group._writer = writer + return producer, headers, feature_writers, writer + + +def _init_kafka_resources( + feature_group: Union[FeatureGroup, ExternalFeatureGroup], + offline_write_options: Dict[str, Any], + project_id: int, +) -> Tuple[ + Producer, Dict[str, bytes], Dict[str, Callable[..., bytes]], Callable[..., bytes] : +]: + # setup kafka producer + producer = init_kafka_producer( + feature_group.feature_store_id, offline_write_options + ) + # setup complex feature writers + feature_writers = { + feature: get_encoder_func(feature_group._get_feature_avro_schema(feature)) + for feature in feature_group.get_complex_features() + } + # setup row writer function + writer = get_encoder_func(feature_group._get_encoded_avro_schema()) + + # custom headers for hopsworks onlineFS + headers = { + "projectId": str(project_id).encode("utf8"), + "featureGroupId": str(feature_group._id).encode("utf8"), + "subjectId": str(feature_group.subject["id"]).encode("utf8"), + } + return producer, headers, feature_writers, writer + + +def init_kafka_producer( + feature_store_id: int, + offline_write_options: Dict[str, Any], +) -> Producer: + # setup kafka producer + return Producer(get_kafka_config(feature_store_id, offline_write_options)) + + +def kafka_get_offsets( + topic_name: str, + feature_store_id: int, + offline_write_options: Dict[str, Any], + high: bool, +) -> str: + consumer = init_kafka_consumer(feature_store_id, offline_write_options) + topics = consumer.list_topics( + timeout=offline_write_options.get("kafka_timeout", 6) + ).topics + if topic_name in topics.keys(): + # topic exists + offsets = "" + tuple_value = 
int(high) + for partition_metadata in topics.get(topic_name).partitions.values(): + partition = TopicPartition( + topic=topic_name, partition=partition_metadata.id + ) + offsets += f",{partition_metadata.id}:{consumer.get_watermark_offsets(partition)[tuple_value]}" + consumer.close() + + return f" -initialCheckPointString {topic_name + offsets}" + return "" + + +def kafka_produce( + producer: Producer, + key: str, + encoded_row: bytes, + topic_name: str, + headers: Dict[str, bytes], + acked: callable, + debug_kafka: bool = False, +) -> None: + while True: + # if BufferError is thrown, we can be sure, message hasn't been send so we retry + try: + # produce + producer.produce( + topic=topic_name, + key=key, + value=encoded_row, + callback=acked, + headers=headers, + ) + + # Trigger internal callbacks to empty op queue + producer.poll(0) + break + except BufferError as e: + if debug_kafka: + print("Caught: {}".format(e)) + # backoff for 1 second + producer.poll(1) + + +def encode_complex_features( + feature_writers: Dict[str, callable], row: Dict[str, Any] +) -> Dict[str, Any]: + for feature_name, writer in feature_writers.items(): + with BytesIO() as outf: + writer(row[feature_name], outf) + row[feature_name] = outf.getvalue() + return row + + +def get_encoder_func(writer_schema: str) -> callable: + if HAS_FAST_AVRO: + schema = json.loads(writer_schema) + parsed_schema = parse_schema(schema) + return lambda record, outf: schemaless_writer(outf, parsed_schema, record) + + parsed_schema = avro.schema.parse(writer_schema) + writer = avro.io.DatumWriter(parsed_schema) + return lambda record, outf: writer.write(record, avro.io.BinaryEncoder(outf)) + + +def get_kafka_config( + feature_store_id: int, + write_options: Optional[Dict[str, Any]] = None, + engine: Literal["spark", "confluent"] = "confluent", +) -> Dict[str, Any]: + if write_options is None: + write_options = {} + external = not ( + isinstance(client.get_instance(), hopsworks.Client) + or write_options.get("internal_kafka", False) + ) + + storage_connector = storage_connector_api.StorageConnectorApi().get_kafka_connector( + feature_store_id, external + ) + + if engine == "spark": + config = storage_connector.spark_options() + config.update(write_options) + elif engine == "confluent": + config = storage_connector.confluent_options() + config.update(write_options.get("kafka_producer_config", {})) + return config + + +def build_ack_callback_and_optional_progress_bar( + n_rows: int, is_multi_part_insert: bool, offline_write_options: Dict[str, Any] +) -> Tuple[Callable, Optional[tqdm]]: + if not is_multi_part_insert: + progress_bar = tqdm( + total=n_rows, + bar_format="{desc}: {percentage:.2f}% |{bar}| Rows {n_fmt}/{total_fmt} | " + "Elapsed Time: {elapsed} | Remaining Time: {remaining}", + desc="Uploading Dataframe", + mininterval=1, + ) + else: + progress_bar = None + + def acked(err: Exception, msg: Any) -> None: + if err is not None: + if offline_write_options.get("debug_kafka", False): + print("Failed to deliver message: %s: %s" % (str(msg), str(err))) + if err.code() in [ + KafkaError.TOPIC_AUTHORIZATION_FAILED, + KafkaError._MSG_TIMED_OUT, + ]: + progress_bar.colour = "RED" + raise err # Stop producing and show error + # update progress bar for each msg + if not is_multi_part_insert: + progress_bar.update() + + return acked, progress_bar diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 4e094a377e..5e2b9699e8 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -32,7 +32,6 @@ from 
typing import ( TYPE_CHECKING, Any, - Callable, Dict, List, Literal, @@ -45,7 +44,6 @@ if TYPE_CHECKING: import great_expectations -import avro import boto3 import hsfs import numpy as np @@ -54,7 +52,6 @@ import pyarrow as pa import pytz from botocore.response import StreamingBody -from confluent_kafka import Consumer, KafkaError, Producer, TopicPartition from hsfs import ( client, feature, @@ -64,7 +61,6 @@ util, ) from hsfs import storage_connector as sc -from hsfs.client import hopsworks from hsfs.client.exceptions import FeatureStoreException from hsfs.constructor import query from hsfs.core import ( @@ -75,6 +71,7 @@ ingestion_job_conf, job, job_api, + kafka_engine, statistics_api, storage_connector_api, training_dataset_api, @@ -89,19 +86,8 @@ from hsfs.training_dataset import TrainingDataset from hsfs.training_dataset_feature import TrainingDatasetFeature from hsfs.training_dataset_split import TrainingDatasetSplit -from sqlalchemy import sql -from tqdm.auto import tqdm -HAS_FAST = False -try: - from fastavro import schemaless_writer - from fastavro.schema import parse_schema - - HAS_FAST = True -except ImportError: - pass - if HAS_GREAT_EXPECTATIONS: import great_expectations @@ -1239,52 +1225,6 @@ def get_unique_values( ) -> np.ndarray: return feature_dataframe[feature_name].unique() - def _init_kafka_producer( - self, - feature_group: Union[FeatureGroup, ExternalFeatureGroup], - offline_write_options: Dict[str, Any], - ) -> Producer: - # setup kafka producer - return Producer( - self._get_kafka_config( - feature_group.feature_store_id, offline_write_options - ) - ) - - def _init_kafka_consumer( - self, - feature_group: Union[FeatureGroup, ExternalFeatureGroup], - offline_write_options: Dict[str, Any], - ) -> Consumer: - # setup kafka consumer - consumer_config = self._get_kafka_config( - feature_group.feature_store_id, offline_write_options - ) - if "group.id" not in consumer_config: - consumer_config["group.id"] = "hsfs_consumer_group" - - return Consumer(consumer_config) - - def _init_kafka_resources( - self, - feature_group: Union[FeatureGroup, ExternalFeatureGroup], - offline_write_options: Dict[str, Any], - ) -> Tuple[Producer, Dict[str, Callable], Callable]: - # setup kafka producer - producer = self._init_kafka_producer(feature_group, offline_write_options) - - # setup complex feature writers - feature_writers = { - feature: self._get_encoder_func( - feature_group._get_feature_avro_schema(feature) - ) - for feature in feature_group.get_complex_features() - } - - # setup row writer function - writer = self._get_encoder_func(feature_group._get_encoded_avro_schema()) - return producer, feature_writers, writer - def _write_dataframe_kafka( self, feature_group: Union[FeatureGroup, ExternalFeatureGroup], @@ -1292,50 +1232,25 @@ def _write_dataframe_kafka( offline_write_options: Dict[str, Any], ) -> Optional[job.Job]: initial_check_point = "" - if feature_group._multi_part_insert: - if feature_group._kafka_producer is None: - producer, feature_writers, writer = self._init_kafka_resources( - feature_group, offline_write_options - ) - feature_group._kafka_producer = producer - feature_group._feature_writers = feature_writers - feature_group._writer = writer - else: - producer = feature_group._kafka_producer - feature_writers = feature_group._feature_writers - writer = feature_group._writer - else: - producer, feature_writers, writer = self._init_kafka_resources( - feature_group, offline_write_options - ) - - # initialize progress bar - progress_bar = tqdm( - 
total=dataframe.shape[0], - bar_format="{desc}: {percentage:.2f}% |{bar}| Rows {n_fmt}/{total_fmt} | " - "Elapsed Time: {elapsed} | Remaining Time: {remaining}", - desc="Uploading Dataframe", - mininterval=1, - ) - + producer, headers, feature_writers, writer = kafka_engine.init_kafka_resources( + feature_group, + offline_write_options, + project_id=client.get_instance().project_id, + ) + if not feature_group._multi_part_insert: # set initial_check_point to the current offset - initial_check_point = self._kafka_get_offsets( - feature_group, offline_write_options, True + initial_check_point = kafka_engine.kafka_get_offsets( + topic_name=feature_group._online_topic_name, + feature_store_id=feature_group.feature_store_id, + offline_write_options=offline_write_options, + high=True, ) - def acked(err: Exception, msg: Any) -> None: - if err is not None: - if offline_write_options.get("debug_kafka", False): - print("Failed to deliver message: %s: %s" % (str(msg), str(err))) - if err.code() in [ - KafkaError.TOPIC_AUTHORIZATION_FAILED, - KafkaError._MSG_TIMED_OUT, - ]: - progress_bar.colour = "RED" - raise err # Stop producing and show error - # update progress bar for each msg - if not feature_group._multi_part_insert: - progress_bar.update() + acked, progress_bar = kafka_engine.build_ack_callback_and_optional_progress_bar( + n_rows=dataframe.shape[0], + is_multi_part_insert=feature_group._multi_part_insert, + offline_write_options=offline_write_options, + ) if isinstance(dataframe, pd.DataFrame): row_iterator = dataframe.itertuples(index=False) @@ -1365,7 +1280,7 @@ def acked(err: Exception, msg: Any) -> None: row[k] = None # encode complex features - row = self._encode_complex_features(feature_writers, row) + row = kafka_engine.encode_complex_features(feature_writers, row) # encode feature row with BytesIO() as outf: @@ -1375,8 +1290,14 @@ def acked(err: Exception, msg: Any) -> None: # assemble key key = "".join([str(row[pk]) for pk in sorted(feature_group.primary_key)]) - self._kafka_produce( - producer, feature_group, key, encoded_row, acked, offline_write_options + kafka_engine.kafka_produce( + producer=producer, + key=key, + encoded_row=encoded_row, + topic_name=feature_group._online_topic_name, + headers=headers, + acked=acked, + debug_kafka=offline_write_options.get("debug_kafka", False), ) # make sure producer blocks and everything is delivered @@ -1384,13 +1305,11 @@ def acked(err: Exception, msg: Any) -> None: producer.flush() progress_bar.close() - # start materialization job + # start materialization job if not an external feature group, otherwise return None + if isinstance(feature_group, ExternalFeatureGroup): + return None # if topic didn't exist, always run the materialization job to reset the offsets except if it's a multi insert - if ( - not isinstance(feature_group, ExternalFeatureGroup) - and not initial_check_point - and not feature_group._multi_part_insert - ): + if not initial_check_point and not feature_group._multi_part_insert: if self._start_offline_materialization(offline_write_options): warnings.warn( "This is the first ingestion after an upgrade or backup/restore, running materialization job even though `start_offline_materialization` was set to `False`.", @@ -1398,17 +1317,18 @@ def acked(err: Exception, msg: Any) -> None: stacklevel=1, ) # set the initial_check_point to the lowest offset (it was not set previously due to topic not existing) - initial_check_point = self._kafka_get_offsets( - feature_group, offline_write_options, False + initial_check_point = 
kafka_engine.kafka_get_offsets( + topic_name=feature_group._online_topic_name, + feature_store_id=feature_group.feature_store_id, + offline_write_options=offline_write_options, + high=True, ) feature_group.materialization_job.run( args=feature_group.materialization_job.config.get("defaultArgs", "") + initial_check_point, await_termination=offline_write_options.get("wait_for_job", False), ) - elif not isinstance( - feature_group, ExternalFeatureGroup - ) and self._start_offline_materialization(offline_write_options): + elif self._start_offline_materialization(offline_write_options): if not offline_write_options.get( "skip_offsets", False ) and self._job_api.last_execution( @@ -1422,110 +1342,8 @@ def acked(err: Exception, msg: Any) -> None: + initial_check_point, await_termination=offline_write_options.get("wait_for_job", False), ) - if isinstance(feature_group, ExternalFeatureGroup): - return None return feature_group.materialization_job - def _kafka_get_offsets( - self, - feature_group: Union[FeatureGroup, ExternalFeatureGroup], - offline_write_options: Dict[str, Any], - high: bool, - ) -> str: - topic_name = feature_group._online_topic_name - consumer = self._init_kafka_consumer(feature_group, offline_write_options) - topics = consumer.list_topics( - timeout=offline_write_options.get("kafka_timeout", 6) - ).topics - if topic_name in topics.keys(): - # topic exists - offsets = "" - tuple_value = int(high) - for partition_metadata in topics.get(topic_name).partitions.values(): - partition = TopicPartition( - topic=topic_name, partition=partition_metadata.id - ) - offsets += f",{partition_metadata.id}:{consumer.get_watermark_offsets(partition)[tuple_value]}" - consumer.close() - - return f" -initialCheckPointString {topic_name + offsets}" - return "" - - def _kafka_produce( - self, - producer: Producer, - feature_group: Union[FeatureGroup, ExternalFeatureGroup], - key: str, - encoded_row: bytes, - acked: callable, - offline_write_options: Dict[str, Any], - ) -> None: - while True: - # if BufferError is thrown, we can be sure, message hasn't been send so we retry - try: - # produce - header = { - "projectId": str(feature_group.feature_store.project_id).encode( - "utf8" - ), - "featureGroupId": str(feature_group._id).encode("utf8"), - "subjectId": str(feature_group.subject["id"]).encode("utf8"), - } - - producer.produce( - topic=feature_group._online_topic_name, - key=key, - value=encoded_row, - callback=acked, - headers=header, - ) - - # Trigger internal callbacks to empty op queue - producer.poll(0) - break - except BufferError as e: - if offline_write_options.get("debug_kafka", False): - print("Caught: {}".format(e)) - # backoff for 1 second - producer.poll(1) - - def _encode_complex_features( - self, feature_writers: Dict[str, callable], row: Dict[str, Any] - ) -> Dict[str, Any]: - for feature_name, writer in feature_writers.items(): - with BytesIO() as outf: - writer(row[feature_name], outf) - row[feature_name] = outf.getvalue() - return row - - def _get_encoder_func(self, writer_schema: str) -> callable: - if HAS_FAST: - schema = json.loads(writer_schema) - parsed_schema = parse_schema(schema) - return lambda record, outf: schemaless_writer(outf, parsed_schema, record) - - parsed_schema = avro.schema.parse(writer_schema) - writer = avro.io.DatumWriter(parsed_schema) - return lambda record, outf: writer.write(record, avro.io.BinaryEncoder(outf)) - - def _get_kafka_config( - self, feature_store_id: int, write_options: Optional[Dict[str, Any]] = None - ) -> Dict[str, Any]: - if 
write_options is None: - write_options = {} - external = not ( - isinstance(client.get_instance(), hopsworks.Client) - or write_options.get("internal_kafka", False) - ) - - storage_connector = self._storage_connector_api.get_kafka_connector( - feature_store_id, external - ) - - config = storage_connector.confluent_options() - config.update(write_options.get("kafka_producer_config", {})) - return config - @staticmethod def _convert_pandas_dtype_to_offline_type(arrow_type: str) -> str: # This is a simple type conversion between pandas dtypes and pyspark (hive) types, diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 4a48b80e08..1a9fcd3872 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -23,13 +23,14 @@ import shutil import warnings from datetime import date, datetime, timezone -from typing import TYPE_CHECKING, Any, List, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union if TYPE_CHECKING: import great_expectations + from pyspark.rdd import RDD + from pyspark.sql import DataFrame -import avro import numpy as np import pandas as pd import tzlocal @@ -82,17 +83,16 @@ def iteritems(self): from hsfs import client, feature, training_dataset_feature, util from hsfs import feature_group as fg_mod -from hsfs.client import hopsworks from hsfs.client.exceptions import FeatureStoreException from hsfs.constructor import query from hsfs.core import ( dataset_api, delta_engine, hudi_engine, - storage_connector_api, + kafka_engine, transformation_function_engine, ) -from hsfs.core.constants import HAS_GREAT_EXPECTATIONS +from hsfs.core.constants import HAS_AVRO, HAS_GREAT_EXPECTATIONS from hsfs.decorators import uses_great_expectations from hsfs.storage_connector import StorageConnector from hsfs.training_dataset_split import TrainingDatasetSplit @@ -101,6 +101,9 @@ def iteritems(self): if HAS_GREAT_EXPECTATIONS: import great_expectations +if HAS_AVRO: + import avro + class Engine: HIVE_FORMAT = "hive" @@ -123,7 +126,6 @@ def __init__(self): if importlib.util.find_spec("pydoop"): # If we are on Databricks don't setup Pydoop as it's not available and cannot be easily installed. 
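With `_get_kafka_config` removed from both engines, the Spark and Python paths now share `kafka_engine.get_kafka_config` and differ only in the `engine` argument. A minimal usage sketch, assuming a connected Hopsworks client and a hypothetical feature store id; the exact keys in the returned dicts depend on the Kafka storage connector configured in the backend:

```python
# Illustrative only: both calls require a live Hopsworks client instance.
from hsfs.core import kafka_engine

feature_store_id = 99                       # hypothetical id for illustration
user_options = {"internal_kafka": True}     # real option name from the diff

# Python engine path: plain confluent-kafka option names (e.g. "bootstrap.servers").
confluent_config = kafka_engine.get_kafka_config(
    feature_store_id, user_options, engine="confluent"
)

# Spark engine path: the same connector, but option names prefixed for Spark's
# Kafka source (e.g. "kafka.bootstrap.servers").
spark_config = kafka_engine.get_kafka_config(
    feature_store_id, user_options, engine="spark"
)
```

Note the asymmetry preserved from the original engines: the Spark branch merges the full write-options dict on top of the connector options, while the confluent branch only merges the nested `kafka_producer_config` entry.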
util.setup_pydoop() - self._storage_connector_api = storage_connector_api.StorageConnectorApi() self._dataset_api = dataset_api.DatasetApi() def sql( @@ -382,17 +384,17 @@ def save_dataframe( def save_stream_dataframe( self, - feature_group, + feature_group: Union[fg_mod.FeatureGroup, fg_mod.ExternalFeatureGroup], dataframe, query_name, output_mode, - await_termination, + await_termination: bool, timeout, - checkpoint_dir, - write_options, + checkpoint_dir: Optional[str], + write_options: Optional[Dict[str, Any]], ): - write_options = self._get_kafka_config( - feature_group.feature_store_id, write_options + write_options = kafka_engine.get_kafka_config( + feature_group.feature_store_id, write_options, engine="spark" ) serialized_df = self._online_fg_to_avro( feature_group, self._encode_complex_features(feature_group, dataframe) @@ -485,8 +487,8 @@ def _save_offline_dataframe( ).saveAsTable(feature_group._get_table_name()) def _save_online_dataframe(self, feature_group, dataframe, write_options): - write_options = self._get_kafka_config( - feature_group.feature_store_id, write_options + write_options = kafka_engine.get_kafka_config( + feature_group.feature_store_id, write_options, engine="spark" ) serialized_df = self._online_fg_to_avro( @@ -511,7 +513,11 @@ def _save_online_dataframe(self, feature_group, dataframe, write_options): "topic", feature_group._online_topic_name ).save() - def _encode_complex_features(self, feature_group, dataframe): + def _encode_complex_features( + self, + feature_group: Union[fg_mod.FeatureGroup, fg_mod.ExternalFeatureGroup], + dataframe: Union[RDD, DataFrame], + ): """Encodes all complex type features to binary using their avro type as schema.""" return dataframe.select( [ @@ -524,7 +530,11 @@ def _encode_complex_features(self, feature_group, dataframe): ] ) - def _online_fg_to_avro(self, feature_group, dataframe): + def _online_fg_to_avro( + self, + feature_group: Union[fg_mod.FeatureGroup, fg_mod.ExternalFeatureGroup], + dataframe: Union[DataFrame, RDD], + ): """Packs all features into named struct to be serialized to single avro/binary column. And packs primary key into arry to be serialized for partitioning. 
""" @@ -976,7 +986,7 @@ def profile( @uses_great_expectations def validate_with_great_expectations( self, - dataframe: TypeVar("pyspark.sql.DataFrame"), # noqa: F821 + dataframe: DataFrame, # noqa: F821 expectation_suite: great_expectations.core.ExpectationSuite, # noqa: F821 ge_validate_kwargs: Optional[dict], ): @@ -1388,24 +1398,6 @@ def cast_columns(df, schema, online=False): df = df.withColumn(_feat, col(_feat).cast(pyspark_schema[_feat])) return df - def _get_kafka_config( - self, feature_store_id: int, write_options: dict = None - ) -> dict: - if write_options is None: - write_options = {} - external = not ( - isinstance(client.get_instance(), hopsworks.Client) - or write_options.get("internal_kafka", False) - ) - - storage_connector = self._storage_connector_api.get_kafka_connector( - feature_store_id, external - ) - - config = storage_connector.spark_options() - config.update(write_options) - return config - @staticmethod def is_connector_type_supported(type): return True diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index 605c2950e7..bbd92c2f18 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -2188,6 +2188,7 @@ def __init__( self._kafka_producer: Optional["confluent_kafka.Producer"] = None self._feature_writers: Optional[Dict[str, callable]] = None self._writer: Optional[callable] = None + self._kafka_headers: Optional[Dict[str, bytes]] = None def read( self, @@ -2907,6 +2908,7 @@ def finalize_multi_part_insert(self) -> None: self._kafka_producer = None self._feature_writers = None self._writer = None + self._kafka_headers = None self._multi_part_insert = False def insert_stream( diff --git a/python/tests/core/test_kafka_engine.py b/python/tests/core/test_kafka_engine.py new file mode 100644 index 0000000000..61c85da7cb --- /dev/null +++ b/python/tests/core/test_kafka_engine.py @@ -0,0 +1,527 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import importlib + +from hsfs import storage_connector +from hsfs.core import constants, kafka_engine + + +if constants.HAS_CONFLUENT_KAFKA: + from confluent_kafka.admin import PartitionMetadata, TopicMetadata + + +class TestKafkaEngine: + def test_kafka_produce(self, mocker): + # Arrange + producer = mocker.Mock() + + # Act + kafka_engine.kafka_produce( + producer=producer, + topic_name="test_topic", + headers={}, + key=None, + encoded_row=None, + acked=None, + debug_kafka=False, + ) + + # Assert + assert producer.produce.call_count == 1 + assert producer.poll.call_count == 1 + + def test_kafka_produce_buffer_error(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + mock_print = mocker.patch("builtins.print") + + producer = mocker.Mock() + producer.produce.side_effect = [BufferError("test_error"), None] + # Act + kafka_engine.kafka_produce( + producer=producer, + topic_name="test_topic", + headers={}, + key=None, + encoded_row=None, + acked=None, + debug_kafka=True, + ) + + # Assert + assert producer.produce.call_count == 2 + assert producer.poll.call_count == 2 + assert mock_print.call_count == 1 + assert mock_print.call_args[0][0] == "Caught: test_error" + + def test_encode_complex_features(self): + # Arrange + def test_utf(value, bytes_io): + bytes_io.write(bytes(value, "utf-8")) + + # Act + result = kafka_engine.encode_complex_features( + feature_writers={"one": test_utf, "two": test_utf}, + row={"one": "1", "two": "2"}, + ) + + # Assert + assert len(result) == 2 + assert result == {"one": b"1", "two": b"2"} + + def test_get_encoder_func(self, mocker): + # Arrange + mock_json_loads = mocker.patch("json.loads") + mock_avro_schema_parse = mocker.patch("avro.schema.parse") + constants.HAS_AVRO = True + constants.HAS_FAST_AVRO = False + importlib.reload(kafka_engine) + + # Act + result = kafka_engine.get_encoder_func( + writer_schema='{"type" : "record",' + '"namespace" : "Tutorialspoint",' + '"name" : "Employee",' + '"fields" : [{ "name" : "Name" , "type" : "string" },' + '{ "name" : "Age" , "type" : "int" }]}' + ) + + # Assert + assert result is not None + assert mock_json_loads.call_count == 0 + assert mock_avro_schema_parse.call_count == 1 + + def test_get_encoder_func_fast(self, mocker): + # Arrange + mock_avro_schema_parse = mocker.patch("avro.schema.parse") + mock_json_loads = mocker.patch( + "json.loads", + return_value={ + "type": "record", + "namespace": "Tutorialspoint", + "name": "Employee", + "fields": [ + {"name": "Name", "type": "string"}, + {"name": "Age", "type": "int"}, + ], + }, + ) + constants.HAS_AVRO = False + constants.HAS_FAST_AVRO = True + importlib.reload(kafka_engine) + + # Act + result = kafka_engine.get_encoder_func( + writer_schema='{"type" : "record",' + '"namespace" : "Tutorialspoint",' + '"name" : "Employee",' + '"fields" : [{ "name" : "Name" , "type" : "string" },' + '{ "name" : "Age" , "type" : "int" }]}' + ) + + # Assert + assert result is not None + assert mock_json_loads.call_count == 1 + assert mock_avro_schema_parse.call_count == 0 + + def test_get_kafka_config(self, mocker, backend_fixtures): + # Arrange + mocker.patch("hsfs.engine.get_instance") + mock_storage_connector_api = mocker.patch( + "hsfs.core.storage_connector_api.StorageConnectorApi" + ) + + json = backend_fixtures["storage_connector"]["get_kafka"]["response"] + sc = storage_connector.StorageConnector.from_response_json(json) + mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc + + mocker.patch("hsfs.core.kafka_engine.isinstance", 
return_value=True) + + mock_client = mocker.patch("hsfs.client.get_instance") + mock_client.return_value._write_pem.return_value = ( + "test_ssl_ca_location", + "test_ssl_certificate_location", + "test_ssl_key_location", + ) + + # Act + result = kafka_engine.get_kafka_config( + 1, + write_options={ + "kafka_producer_config": {"test_name_1": "test_value_1"}, + }, + ) + + # Assert + assert ( + mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] + is False + ) + assert result == { + "bootstrap.servers": "test_bootstrap_servers", + "security.protocol": "test_security_protocol", + "ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", + "ssl.ca.location": "test_ssl_ca_location", + "ssl.certificate.location": "test_ssl_certificate_location", + "ssl.key.location": "test_ssl_key_location", + "test_name_1": "test_value_1", + } + + def test_get_kafka_config_external_client(self, mocker, backend_fixtures): + # Arrange + mocker.patch("hsfs.engine.get_instance") + mock_storage_connector_api = mocker.patch( + "hsfs.core.storage_connector_api.StorageConnectorApi" + ) + + json = backend_fixtures["storage_connector"]["get_kafka"]["response"] + sc = storage_connector.StorageConnector.from_response_json(json) + mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc + + mocker.patch("hsfs.engine.python.isinstance", return_value=False) + + mock_client = mocker.patch("hsfs.client.get_instance") + mock_client.return_value._write_pem.return_value = ( + "test_ssl_ca_location", + "test_ssl_certificate_location", + "test_ssl_key_location", + ) + + # Act + result = kafka_engine.get_kafka_config( + 1, + write_options={ + "kafka_producer_config": {"test_name_1": "test_value_1"}, + }, + ) + + # Assert + assert ( + mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] + is True + ) + assert result == { + "bootstrap.servers": "test_bootstrap_servers", + "security.protocol": "test_security_protocol", + "ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", + "ssl.ca.location": "test_ssl_ca_location", + "ssl.certificate.location": "test_ssl_certificate_location", + "ssl.key.location": "test_ssl_key_location", + "test_name_1": "test_value_1", + } + + def test_get_kafka_config_internal_kafka(self, mocker, backend_fixtures): + # Arrange + mocker.patch("hsfs.engine.get_instance") + mock_storage_connector_api = mocker.patch( + "hsfs.core.storage_connector_api.StorageConnectorApi" + ) + + json = backend_fixtures["storage_connector"]["get_kafka"]["response"] + sc = storage_connector.StorageConnector.from_response_json(json) + mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc + + mocker.patch("hsfs.engine.python.isinstance", return_value=True) + + mock_client = mocker.patch("hsfs.client.get_instance") + mock_client.return_value._write_pem.return_value = ( + "test_ssl_ca_location", + "test_ssl_certificate_location", + "test_ssl_key_location", + ) + + # Act + result = kafka_engine.get_kafka_config( + 1, + write_options={ + "kafka_producer_config": {"test_name_1": "test_value_1"}, + "internal_kafka": True, + }, + ) + + # Assert + assert ( + mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] + is False + ) + assert result == { + "bootstrap.servers": "test_bootstrap_servers", + "security.protocol": "test_security_protocol", + "ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", + "ssl.ca.location": 
"test_ssl_ca_location", + "ssl.certificate.location": "test_ssl_certificate_location", + "ssl.key.location": "test_ssl_key_location", + "test_name_1": "test_value_1", + } + + def test_get_kafka_config_external_client_internal_kafka( + self, mocker, backend_fixtures + ): + # Arrange + mocker.patch("hsfs.engine.get_instance") + mock_storage_connector_api = mocker.patch( + "hsfs.core.storage_connector_api.StorageConnectorApi" + ) + + json = backend_fixtures["storage_connector"]["get_kafka"]["response"] + sc = storage_connector.StorageConnector.from_response_json(json) + mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc + + mocker.patch("hsfs.engine.python.isinstance", return_value=False) + + mock_client = mocker.patch("hsfs.client.get_instance") + mock_client.return_value._write_pem.return_value = ( + "test_ssl_ca_location", + "test_ssl_certificate_location", + "test_ssl_key_location", + ) + + # Act + result = kafka_engine.get_kafka_config( + 1, + write_options={ + "kafka_producer_config": {"test_name_1": "test_value_1"}, + "internal_kafka": True, + }, + ) + + # Assert + assert ( + mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] + is False + ) + assert result == { + "bootstrap.servers": "test_bootstrap_servers", + "security.protocol": "test_security_protocol", + "ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", + "ssl.ca.location": "test_ssl_ca_location", + "ssl.certificate.location": "test_ssl_certificate_location", + "ssl.key.location": "test_ssl_key_location", + "test_name_1": "test_value_1", + } + + def test_kafka_get_offsets_high(self, mocker): + # Arrange + feature_store_id = 99 + topic_name = "test_topic" + partition_metadata = PartitionMetadata() + partition_metadata.id = 0 + topic_metadata = TopicMetadata() + topic_metadata.partitions = {partition_metadata.id: partition_metadata} + topic_mock = mocker.MagicMock() + + # return no topics and one commit, so it should start the job with the extra arg + topic_mock.topics = {topic_name: topic_metadata} + + consumer = mocker.MagicMock() + consumer.list_topics = mocker.MagicMock(return_value=topic_mock) + consumer.get_watermark_offsets = mocker.MagicMock(return_value=(0, 11)) + mocker.patch( + "hsfs.core.kafka_engine.init_kafka_consumer", + return_value=consumer, + ) + + # Act + result = kafka_engine.kafka_get_offsets( + topic_name=topic_name, + feature_store_id=feature_store_id, + offline_write_options={}, + high=True, + ) + + # Assert + assert result == f" -initialCheckPointString {topic_name},0:11" + + def test_kafka_get_offsets_low(self, mocker): + # Arrange + feature_store_id = 99 + topic_name = "test_topic" + partition_metadata = PartitionMetadata() + partition_metadata.id = 0 + topic_metadata = TopicMetadata() + topic_metadata.partitions = {partition_metadata.id: partition_metadata} + topic_mock = mocker.MagicMock() + + # return no topics and one commit, so it should start the job with the extra arg + topic_mock.topics = {topic_name: topic_metadata} + + consumer = mocker.MagicMock() + consumer.list_topics = mocker.MagicMock(return_value=topic_mock) + consumer.get_watermark_offsets = mocker.MagicMock(return_value=(0, 11)) + mocker.patch( + "hsfs.core.kafka_engine.init_kafka_consumer", + return_value=consumer, + ) + + # Act + result = kafka_engine.kafka_get_offsets( + feature_store_id=feature_store_id, + topic_name=topic_name, + offline_write_options={}, + high=False, + ) + + # Assert + assert result == f" -initialCheckPointString 
{topic_name},0:0" + + def test_kafka_get_offsets_no_topic(self, mocker): + # Arrange + topic_name = "test_topic" + topic_mock = mocker.MagicMock() + + # return no topics and one commit, so it should start the job with the extra arg + topic_mock.topics = {} + + consumer = mocker.MagicMock() + consumer.list_topics = mocker.MagicMock(return_value=topic_mock) + consumer.get_watermark_offsets = mocker.MagicMock(return_value=(0, 11)) + mocker.patch( + "hsfs.core.kafka_engine.init_kafka_consumer", + return_value=consumer, + ) + # Act + result = kafka_engine.kafka_get_offsets( + topic_name=topic_name, + feature_store_id=99, + offline_write_options={}, + high=True, + ) + + # Assert + assert result == "" + + def test_spark_get_kafka_config(self, mocker, backend_fixtures): + # Arrange + mocker.patch("hsfs.client.get_instance") + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mock_engine_get_instance.return_value.add_file.return_value = ( + "result_from_add_file" + ) + mock_storage_connector_api = mocker.patch( + "hsfs.core.storage_connector_api.StorageConnectorApi" + ) + json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] + sc = storage_connector.StorageConnector.from_response_json(json) + mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc + + mocker.patch("hsfs.core.kafka_engine.isinstance", return_value=True) + + # Act + results = kafka_engine.get_kafka_config( + 1, write_options={"user_opt": "ABC"}, engine="spark" + ) + + # Assert + assert results == { + "kafka.bootstrap.servers": "test_bootstrap_servers", + "kafka.security.protocol": "test_security_protocol", + "kafka.ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", + "kafka.ssl.key.password": "test_ssl_key_password", + "kafka.ssl.keystore.location": "result_from_add_file", + "kafka.ssl.keystore.password": "test_ssl_keystore_password", + "kafka.ssl.truststore.location": "result_from_add_file", + "kafka.ssl.truststore.password": "test_ssl_truststore_password", + "kafka.test_option_name": "test_option_value", + "user_opt": "ABC", + } + assert ( + mock_storage_connector_api.return_value.get_kafka_connector.call_count == 1 + ) + assert ( + mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] + is False + ) + + def test_spark_get_kafka_config_external_client(self, mocker, backend_fixtures): + # Arrange + mocker.patch("hsfs.client.get_instance") + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mock_engine_get_instance.return_value.add_file.return_value = ( + "result_from_add_file" + ) + mock_storage_connector_api = mocker.patch( + "hsfs.core.storage_connector_api.StorageConnectorApi" + ) + json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] + sc = storage_connector.StorageConnector.from_response_json(json) + mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc + + # Act + results = kafka_engine.get_kafka_config( + 1, write_options={"user_opt": "ABC"}, engine="spark" + ) + + # Assert + assert results == { + "kafka.bootstrap.servers": "test_bootstrap_servers", + "kafka.security.protocol": "test_security_protocol", + "kafka.ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", + "kafka.ssl.key.password": "test_ssl_key_password", + "kafka.ssl.keystore.location": "result_from_add_file", + "kafka.ssl.keystore.password": "test_ssl_keystore_password", + "kafka.ssl.truststore.location": "result_from_add_file", + 
"kafka.ssl.truststore.password": "test_ssl_truststore_password", + "kafka.test_option_name": "test_option_value", + "user_opt": "ABC", + } + assert ( + mock_storage_connector_api.return_value.get_kafka_connector.call_count == 1 + ) + assert ( + mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] + is True + ) + + def test_spark_get_kafka_config_internal_kafka(self, mocker, backend_fixtures): + # Arrange + mocker.patch("hsfs.client.get_instance") + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mock_engine_get_instance.return_value.add_file.return_value = ( + "result_from_add_file" + ) + mock_storage_connector_api = mocker.patch( + "hsfs.core.storage_connector_api.StorageConnectorApi" + ) + json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] + sc = storage_connector.StorageConnector.from_response_json(json) + mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc + + # Act + results = kafka_engine.get_kafka_config( + 1, write_options={"user_opt": "ABC", "internal_kafka": True}, engine="spark" + ) + + # Assert + assert results == { + "kafka.bootstrap.servers": "test_bootstrap_servers", + "kafka.security.protocol": "test_security_protocol", + "kafka.ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", + "kafka.ssl.key.password": "test_ssl_key_password", + "kafka.ssl.keystore.location": "result_from_add_file", + "kafka.ssl.keystore.password": "test_ssl_keystore_password", + "kafka.ssl.truststore.location": "result_from_add_file", + "kafka.ssl.truststore.password": "test_ssl_truststore_password", + "kafka.test_option_name": "test_option_value", + "user_opt": "ABC", + "internal_kafka": True, + } + assert ( + mock_storage_connector_api.return_value.get_kafka_connector.call_count == 1 + ) + assert ( + mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] + is False + ) diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index a6740b71e6..6ec54beab8 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -21,7 +21,6 @@ import polars as pl import pyarrow as pa import pytest -from confluent_kafka.admin import PartitionMetadata, TopicMetadata from hsfs import ( feature, feature_group, @@ -2956,6 +2955,8 @@ def test_write_training_dataset_query_td(self, mocker, backend_fixtures): mock_td_api.return_value.compute.return_value = mock_job mocker.patch("hsfs.util.get_job_url") + mocker.patch("hsfs.client.get_instance") + python_engine = python.Engine() fg = feature_group.FeatureGroup.from_response_json( @@ -3304,344 +3305,18 @@ def test_get_unique_values(self): assert 2 in result assert 3 in result - def test_kafka_produce(self, mocker): - # Arrange - mocker.patch("hsfs.client.get_instance") - - python_engine = python.Engine() - - producer = mocker.Mock() - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - stream=False, - ) - fg.feature_store = mocker.Mock() - - # Act - python_engine._kafka_produce( - producer=producer, - feature_group=fg, - key=None, - encoded_row=None, - acked=None, - offline_write_options={}, - ) - - # Assert - assert producer.produce.call_count == 1 - assert producer.poll.call_count == 1 - - def test_kafka_produce_buffer_error(self, mocker): - # Arrange - mocker.patch("hsfs.client.get_instance") - mock_print = mocker.patch("builtins.print") - - python_engine = python.Engine() - - 
producer = mocker.Mock() - producer.produce.side_effect = [BufferError("test_error"), None] - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - stream=False, - ) - fg.feature_store = mocker.Mock() - - # Act - python_engine._kafka_produce( - producer=producer, - feature_group=fg, - key=None, - encoded_row=None, - acked=None, - offline_write_options={"debug_kafka": True}, - ) - - # Assert - assert producer.produce.call_count == 2 - assert producer.poll.call_count == 2 - assert mock_print.call_count == 1 - assert mock_print.call_args[0][0] == "Caught: test_error" - - def test_encode_complex_features(self): - # Arrange - python_engine = python.Engine() - - def test_utf(value, bytes_io): - bytes_io.write(bytes(value, "utf-8")) - - # Act - result = python_engine._encode_complex_features( - feature_writers={"one": test_utf, "two": test_utf}, - row={"one": "1", "two": "2"}, - ) - - # Assert - assert len(result) == 2 - assert result == {"one": b"1", "two": b"2"} - - def test_get_encoder_func(self, mocker): - # Arrange - mock_json_loads = mocker.patch("json.loads") - mock_avro_schema_parse = mocker.patch("avro.schema.parse") - - python_engine = python.Engine() - python.HAS_FAST = False - - # Act - result = python_engine._get_encoder_func( - writer_schema='{"type" : "record",' - '"namespace" : "Tutorialspoint",' - '"name" : "Employee",' - '"fields" : [{ "name" : "Name" , "type" : "string" },' - '{ "name" : "Age" , "type" : "int" }]}' - ) - - # Assert - assert result is not None - assert mock_json_loads.call_count == 0 - assert mock_avro_schema_parse.call_count == 1 - - def test_get_encoder_func_fast(self, mocker): - # Arrange - mock_json_loads = mocker.patch( - "json.loads", - return_value={ - "type": "record", - "namespace": "Tutorialspoint", - "name": "Employee", - "fields": [ - {"name": "Name", "type": "string"}, - {"name": "Age", "type": "int"}, - ], - }, - ) - mock_avro_schema_parse = mocker.patch("avro.schema.parse") - - python_engine = python.Engine() - python.HAS_FAST = True - - # Act - result = python_engine._get_encoder_func( - writer_schema='{"type" : "record",' - '"namespace" : "Tutorialspoint",' - '"name" : "Employee",' - '"fields" : [{ "name" : "Name" , "type" : "string" },' - '{ "name" : "Age" , "type" : "int" }]}' - ) - - # Assert - assert result is not None - assert mock_json_loads.call_count == 1 - assert mock_avro_schema_parse.call_count == 0 - - def test_get_kafka_config(self, mocker, backend_fixtures): - # Arrange - mocker.patch("hsfs.engine.get_instance") - mock_storage_connector_api = mocker.patch( - "hsfs.core.storage_connector_api.StorageConnectorApi" - ) - - json = backend_fixtures["storage_connector"]["get_kafka"]["response"] - sc = storage_connector.StorageConnector.from_response_json(json) - mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc - - mocker.patch("hsfs.engine.python.isinstance", return_value=True) - - mock_client = mocker.patch("hsfs.client.get_instance") - mock_client.return_value._write_pem.return_value = ( - "test_ssl_ca_location", - "test_ssl_certificate_location", - "test_ssl_key_location", - ) - - python_engine = python.Engine() - - # Act - result = python_engine._get_kafka_config( - 1, - write_options={ - "kafka_producer_config": {"test_name_1": "test_value_1"}, - }, - ) - - # Assert - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] - is False - ) - assert result == { - "bootstrap.servers": 
"test_bootstrap_servers", - "security.protocol": "test_security_protocol", - "ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", - "ssl.ca.location": "test_ssl_ca_location", - "ssl.certificate.location": "test_ssl_certificate_location", - "ssl.key.location": "test_ssl_key_location", - "test_name_1": "test_value_1", - } - - def test_get_kafka_config_external_client(self, mocker, backend_fixtures): - # Arrange - mocker.patch("hsfs.engine.get_instance") - mock_storage_connector_api = mocker.patch( - "hsfs.core.storage_connector_api.StorageConnectorApi" - ) - - json = backend_fixtures["storage_connector"]["get_kafka"]["response"] - sc = storage_connector.StorageConnector.from_response_json(json) - mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc - - mocker.patch("hsfs.engine.python.isinstance", return_value=False) - - mock_client = mocker.patch("hsfs.client.get_instance") - mock_client.return_value._write_pem.return_value = ( - "test_ssl_ca_location", - "test_ssl_certificate_location", - "test_ssl_key_location", - ) - - python_engine = python.Engine() - - # Act - result = python_engine._get_kafka_config( - 1, - write_options={ - "kafka_producer_config": {"test_name_1": "test_value_1"}, - }, - ) - - # Assert - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] - is True - ) - assert result == { - "bootstrap.servers": "test_bootstrap_servers", - "security.protocol": "test_security_protocol", - "ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", - "ssl.ca.location": "test_ssl_ca_location", - "ssl.certificate.location": "test_ssl_certificate_location", - "ssl.key.location": "test_ssl_key_location", - "test_name_1": "test_value_1", - } - - def test_get_kafka_config_internal_kafka(self, mocker, backend_fixtures): - # Arrange - mocker.patch("hsfs.engine.get_instance") - mock_storage_connector_api = mocker.patch( - "hsfs.core.storage_connector_api.StorageConnectorApi" - ) - - json = backend_fixtures["storage_connector"]["get_kafka"]["response"] - sc = storage_connector.StorageConnector.from_response_json(json) - mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc - - mocker.patch("hsfs.engine.python.isinstance", return_value=True) - - mock_client = mocker.patch("hsfs.client.get_instance") - mock_client.return_value._write_pem.return_value = ( - "test_ssl_ca_location", - "test_ssl_certificate_location", - "test_ssl_key_location", - ) - - python_engine = python.Engine() - - # Act - result = python_engine._get_kafka_config( - 1, - write_options={ - "kafka_producer_config": {"test_name_1": "test_value_1"}, - "internal_kafka": True, - }, - ) - - # Assert - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] - is False - ) - assert result == { - "bootstrap.servers": "test_bootstrap_servers", - "security.protocol": "test_security_protocol", - "ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", - "ssl.ca.location": "test_ssl_ca_location", - "ssl.certificate.location": "test_ssl_certificate_location", - "ssl.key.location": "test_ssl_key_location", - "test_name_1": "test_value_1", - } - - def test_get_kafka_config_external_client_internal_kafka( - self, mocker, backend_fixtures - ): - # Arrange - mocker.patch("hsfs.engine.get_instance") - mock_storage_connector_api = mocker.patch( - "hsfs.core.storage_connector_api.StorageConnectorApi" - ) - - json = 
backend_fixtures["storage_connector"]["get_kafka"]["response"] - sc = storage_connector.StorageConnector.from_response_json(json) - mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc - - mocker.patch("hsfs.engine.python.isinstance", return_value=False) - - mock_client = mocker.patch("hsfs.client.get_instance") - mock_client.return_value._write_pem.return_value = ( - "test_ssl_ca_location", - "test_ssl_certificate_location", - "test_ssl_key_location", - ) - - python_engine = python.Engine() - - # Act - result = python_engine._get_kafka_config( - 1, - write_options={ - "kafka_producer_config": {"test_name_1": "test_value_1"}, - "internal_kafka": True, - }, - ) - - # Assert - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] - is False - ) - assert result == { - "bootstrap.servers": "test_bootstrap_servers", - "security.protocol": "test_security_protocol", - "ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", - "ssl.ca.location": "test_ssl_ca_location", - "ssl.certificate.location": "test_ssl_certificate_location", - "ssl.key.location": "test_ssl_key_location", - "test_name_1": "test_value_1", - } - def test_materialization_kafka(self, mocker): # Arrange - mocker.patch("hsfs.engine.python.Engine._get_kafka_config", return_value={}) + mocker.patch("hsfs.core.kafka_engine.get_kafka_config", return_value={}) mocker.patch("hsfs.feature_group.FeatureGroup._get_encoded_avro_schema") - mocker.patch("hsfs.engine.python.Engine._get_encoder_func") - mocker.patch("hsfs.engine.python.Engine._encode_complex_features") + mocker.patch("hsfs.core.kafka_engine.get_encoder_func") + mocker.patch("hsfs.core.kafka_engine.encode_complex_features") mock_python_engine_kafka_produce = mocker.patch( - "hsfs.engine.python.Engine._kafka_produce" + "hsfs.core.kafka_engine.kafka_produce" ) mocker.patch("hsfs.util.get_job_url") mocker.patch( - "hsfs.engine.python.Engine._kafka_get_offsets", + "hsfs.core.kafka_engine.kafka_get_offsets", return_value=" tests_offsets", ) mocker.patch( @@ -3649,6 +3324,8 @@ def test_materialization_kafka(self, mocker): return_value=["", ""], ) + mocker.patch("hsfs.client.get_instance") + python_engine = python.Engine() fg = feature_group.FeatureGroup( @@ -3687,16 +3364,16 @@ def test_materialization_kafka(self, mocker): def test_materialization_kafka_first_job_execution(self, mocker): # Arrange - mocker.patch("hsfs.engine.python.Engine._get_kafka_config", return_value={}) + mocker.patch("hsfs.core.kafka_engine.get_kafka_config", return_value={}) mocker.patch("hsfs.feature_group.FeatureGroup._get_encoded_avro_schema") - mocker.patch("hsfs.engine.python.Engine._get_encoder_func") - mocker.patch("hsfs.engine.python.Engine._encode_complex_features") + mocker.patch("hsfs.core.kafka_engine.get_encoder_func") + mocker.patch("hsfs.core.kafka_engine.encode_complex_features") mock_python_engine_kafka_produce = mocker.patch( - "hsfs.engine.python.Engine._kafka_produce" + "hsfs.core.kafka_engine.kafka_produce" ) mocker.patch("hsfs.util.get_job_url") mocker.patch( - "hsfs.engine.python.Engine._kafka_get_offsets", + "hsfs.core.kafka_engine.kafka_get_offsets", return_value=" tests_offsets", ) mocker.patch( @@ -3704,6 +3381,8 @@ def test_materialization_kafka_first_job_execution(self, mocker): return_value=[], ) + mocker.patch("hsfs.client.get_instance") + python_engine = python.Engine() fg = feature_group.FeatureGroup( @@ -3742,19 +3421,21 @@ def test_materialization_kafka_first_job_execution(self, mocker): def 
test_materialization_kafka_skip_offsets(self, mocker): # Arrange - mocker.patch("hsfs.engine.python.Engine._get_kafka_config", return_value={}) + mocker.patch("hsfs.core.kafka_engine.get_kafka_config", return_value={}) mocker.patch("hsfs.feature_group.FeatureGroup._get_encoded_avro_schema") - mocker.patch("hsfs.engine.python.Engine._get_encoder_func") - mocker.patch("hsfs.engine.python.Engine._encode_complex_features") + mocker.patch("hsfs.core.kafka_engine.get_encoder_func") + mocker.patch("hsfs.core.kafka_engine.encode_complex_features") mock_python_engine_kafka_produce = mocker.patch( - "hsfs.engine.python.Engine._kafka_produce" + "hsfs.core.kafka_engine.kafka_produce" ) mocker.patch("hsfs.util.get_job_url") mocker.patch( - "hsfs.engine.python.Engine._kafka_get_offsets", + "hsfs.core.kafka_engine.kafka_get_offsets", return_value=" tests_offsets", ) + mocker.patch("hsfs.client.get_instance") + python_engine = python.Engine() fg = feature_group.FeatureGroup( @@ -3796,19 +3477,21 @@ def test_materialization_kafka_skip_offsets(self, mocker): def test_materialization_kafka_topic_doesnt_exist(self, mocker): # Arrange - mocker.patch("hsfs.engine.python.Engine._get_kafka_config", return_value={}) + mocker.patch("hsfs.core.kafka_engine.get_kafka_config", return_value={}) mocker.patch("hsfs.feature_group.FeatureGroup._get_encoded_avro_schema") - mocker.patch("hsfs.engine.python.Engine._get_encoder_func") - mocker.patch("hsfs.engine.python.Engine._encode_complex_features") + mocker.patch("hsfs.core.kafka_engine.get_encoder_func") + mocker.patch("hsfs.core.kafka_engine.encode_complex_features") mock_python_engine_kafka_produce = mocker.patch( - "hsfs.engine.python.Engine._kafka_produce" + "hsfs.core.kafka_engine.kafka_produce" ) mocker.patch("hsfs.util.get_job_url") mocker.patch( - "hsfs.engine.python.Engine._kafka_get_offsets", + "hsfs.core.kafka_engine.kafka_get_offsets", side_effect=["", " tests_offsets"], ) + mocker.patch("hsfs.client.get_instance") + python_engine = python.Engine() fg = feature_group.FeatureGroup( @@ -3845,134 +3528,6 @@ def test_materialization_kafka_topic_doesnt_exist(self, mocker): await_termination=False, ) - def test_kafka_get_offsets_high(self, mocker): - # Arrange - topic_name = "test_topic" - partition_metadata = PartitionMetadata() - partition_metadata.id = 0 - topic_metadata = TopicMetadata() - topic_metadata.partitions = {partition_metadata.id: partition_metadata} - topic_mock = mocker.MagicMock() - - # return no topics and one commit, so it should start the job with the extra arg - topic_mock.topics = {topic_name: topic_metadata} - - consumer = mocker.MagicMock() - consumer.list_topics = mocker.MagicMock(return_value=topic_mock) - consumer.get_watermark_offsets = mocker.MagicMock(return_value=(0, 11)) - mocker.patch( - "hsfs.engine.python.Engine._init_kafka_consumer", - return_value=consumer, - ) - - python_engine = python.Engine() - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - stream=False, - time_travel_format="HUDI", - ) - fg._online_topic_name = topic_name - - # Act - result = python_engine._kafka_get_offsets( - feature_group=fg, - offline_write_options={}, - high=True, - ) - - # Assert - assert result == f" -initialCheckPointString {topic_name},0:11" - - def test_kafka_get_offsets_low(self, mocker): - # Arrange - topic_name = "test_topic" - partition_metadata = PartitionMetadata() - partition_metadata.id = 0 - topic_metadata = TopicMetadata() - topic_metadata.partitions = 
{partition_metadata.id: partition_metadata} - topic_mock = mocker.MagicMock() - - # return no topics and one commit, so it should start the job with the extra arg - topic_mock.topics = {topic_name: topic_metadata} - - consumer = mocker.MagicMock() - consumer.list_topics = mocker.MagicMock(return_value=topic_mock) - consumer.get_watermark_offsets = mocker.MagicMock(return_value=(0, 11)) - mocker.patch( - "hsfs.engine.python.Engine._init_kafka_consumer", - return_value=consumer, - ) - - python_engine = python.Engine() - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - stream=False, - time_travel_format="HUDI", - ) - fg._online_topic_name = topic_name - - # Act - result = python_engine._kafka_get_offsets( - feature_group=fg, - offline_write_options={}, - high=False, - ) - - # Assert - assert result == f" -initialCheckPointString {topic_name},0:0" - - def test_kafka_get_offsets_no_topic(self, mocker): - # Arrange - topic_name = "test_topic" - topic_mock = mocker.MagicMock() - - # return no topics and one commit, so it should start the job with the extra arg - topic_mock.topics = {} - - consumer = mocker.MagicMock() - consumer.list_topics = mocker.MagicMock(return_value=topic_mock) - consumer.get_watermark_offsets = mocker.MagicMock(return_value=(0, 11)) - mocker.patch( - "hsfs.engine.python.Engine._init_kafka_consumer", - return_value=consumer, - ) - - python_engine = python.Engine() - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - stream=False, - time_travel_format="HUDI", - ) - fg._online_topic_name = topic_name - - # Act - result = python_engine._kafka_get_offsets( - feature_group=fg, - offline_write_options={}, - high=True, - ) - - # Assert - assert result == "" - def test_test(self, mocker): fg = feature_group.FeatureGroup( name="test", diff --git a/python/tests/engine/test_python_writer.py b/python/tests/engine/test_python_writer.py index 0dd0533156..2c6a8fd3d1 100644 --- a/python/tests/engine/test_python_writer.py +++ b/python/tests/engine/test_python_writer.py @@ -27,7 +27,8 @@ class TestPythonWriter: def test_write_dataframe_kafka(self, mocker, dataframe_fixture_times): # Arrange - mocker.patch("hsfs.engine.python.Engine._get_kafka_config", return_value={}) + mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.kafka_engine.get_kafka_config", return_value={}) avro_schema_mock = mocker.patch( "hsfs.feature_group.FeatureGroup._get_encoded_avro_schema" ) @@ -45,7 +46,7 @@ def test_write_dataframe_kafka(self, mocker, dataframe_fixture_times): ) avro_schema_mock.side_effect = [avro_schema] mock_python_engine_kafka_produce = mocker.patch( - "hsfs.engine.python.Engine._kafka_produce" + "hsfs.core.kafka_engine.kafka_produce" ) mocker.patch("hsfs.core.job_api.JobApi") # get, launch mocker.patch("hsfs.util.get_job_url") @@ -55,11 +56,7 @@ def test_write_dataframe_kafka(self, mocker, dataframe_fixture_times): topic_mock.topics = {topic_name: topic_metadata} consumer = mocker.MagicMock() consumer.list_topics = mocker.MagicMock(return_value=topic_mock) - mocker.patch( - "hsfs.engine.python.Consumer", - return_value=consumer, - ) - mocker.patch("hsfs.engine.python.Producer") + mocker.patch("hsfs.core.kafka_engine.Consumer", return_value=consumer) python_engine = python.Engine() fg = feature_group.FeatureGroup( @@ -83,7 +80,8 @@ def test_write_dataframe_kafka(self, mocker, dataframe_fixture_times): ) # Assert - 
encoded_row = mock_python_engine_kafka_produce.call_args[0][3] + print(mock_python_engine_kafka_produce.call_args) + encoded_row = mock_python_engine_kafka_produce.call_args[1]["encoded_row"] print("Value" + str(encoded_row)) parsed_schema = fastavro.parse_schema(json.loads(avro_schema)) with BytesIO() as outf: diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 8b98f521c6..5e7699b534 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -4487,187 +4487,3 @@ def test_create_empty_df(self): # Assert assert result.schema == spark_df.schema assert result.collect() == [] - - def test_get_kafka_config(self, mocker, backend_fixtures): - # Arrange - mocker.patch("hsfs.engine.get_type") - mocker.patch("hsfs.client.get_instance") - mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") - mock_engine_get_instance.return_value.add_file.return_value = ( - "result_from_add_file" - ) - - mocker.patch("hsfs.engine.spark.isinstance", return_value=True) - - mock_storage_connector_api = mocker.patch( - "hsfs.core.storage_connector_api.StorageConnectorApi" - ) - json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] - sc = storage_connector.StorageConnector.from_response_json(json) - mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc - - spark_engine = spark.Engine() - - # Act - results = spark_engine._get_kafka_config(1, write_options={"user_opt": "ABC"}) - - # Assert - assert results == { - "kafka.bootstrap.servers": "test_bootstrap_servers", - "kafka.security.protocol": "test_security_protocol", - "kafka.ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", - "kafka.ssl.key.password": "test_ssl_key_password", - "kafka.ssl.keystore.location": "result_from_add_file", - "kafka.ssl.keystore.password": "test_ssl_keystore_password", - "kafka.ssl.truststore.location": "result_from_add_file", - "kafka.ssl.truststore.password": "test_ssl_truststore_password", - "kafka.test_option_name": "test_option_value", - "user_opt": "ABC", - } - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_count == 1 - ) - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] - is False - ) - - def test_get_kafka_config_external_client(self, mocker, backend_fixtures): - # Arrange - mocker.patch("hsfs.engine.get_type") - mocker.patch("hsfs.client.get_instance") - mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") - mock_engine_get_instance.return_value.add_file.return_value = ( - "result_from_add_file" - ) - - mocker.patch("hsfs.engine.spark.isinstance", return_value=False) - - mock_storage_connector_api = mocker.patch( - "hsfs.core.storage_connector_api.StorageConnectorApi" - ) - json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] - sc = storage_connector.StorageConnector.from_response_json(json) - mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc - - spark_engine = spark.Engine() - - # Act - results = spark_engine._get_kafka_config(1, write_options={"user_opt": "ABC"}) - - # Assert - assert results == { - "kafka.bootstrap.servers": "test_bootstrap_servers", - "kafka.security.protocol": "test_security_protocol", - "kafka.ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", - "kafka.ssl.key.password": "test_ssl_key_password", - "kafka.ssl.keystore.location": "result_from_add_file", - 
"kafka.ssl.keystore.password": "test_ssl_keystore_password", - "kafka.ssl.truststore.location": "result_from_add_file", - "kafka.ssl.truststore.password": "test_ssl_truststore_password", - "kafka.test_option_name": "test_option_value", - "user_opt": "ABC", - } - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_count == 1 - ) - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] - is True - ) - - def test_get_kafka_config_internal_kafka(self, mocker, backend_fixtures): - # Arrange - mocker.patch("hsfs.engine.get_type") - mocker.patch("hsfs.client.get_instance") - mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") - mock_engine_get_instance.return_value.add_file.return_value = ( - "result_from_add_file" - ) - - mocker.patch("hsfs.engine.spark.isinstance", return_value=True) - - mock_storage_connector_api = mocker.patch( - "hsfs.core.storage_connector_api.StorageConnectorApi" - ) - json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] - sc = storage_connector.StorageConnector.from_response_json(json) - mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc - - spark_engine = spark.Engine() - - # Act - results = spark_engine._get_kafka_config( - 1, write_options={"user_opt": "ABC", "internal_kafka": True} - ) - - # Assert - assert results == { - "kafka.bootstrap.servers": "test_bootstrap_servers", - "kafka.security.protocol": "test_security_protocol", - "kafka.ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", - "kafka.ssl.key.password": "test_ssl_key_password", - "kafka.ssl.keystore.location": "result_from_add_file", - "kafka.ssl.keystore.password": "test_ssl_keystore_password", - "kafka.ssl.truststore.location": "result_from_add_file", - "kafka.ssl.truststore.password": "test_ssl_truststore_password", - "kafka.test_option_name": "test_option_value", - "user_opt": "ABC", - "internal_kafka": True, - } - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_count == 1 - ) - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] - is False - ) - - def test_get_kafka_config_external_client_internal_kafka( - self, mocker, backend_fixtures - ): - # Arrange - mocker.patch("hsfs.engine.get_type") - mocker.patch("hsfs.client.get_instance") - mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") - mock_engine_get_instance.return_value.add_file.return_value = ( - "result_from_add_file" - ) - - mocker.patch("hsfs.engine.spark.isinstance", return_value=False) - - mock_storage_connector_api = mocker.patch( - "hsfs.core.storage_connector_api.StorageConnectorApi" - ) - json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] - sc = storage_connector.StorageConnector.from_response_json(json) - mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc - - spark_engine = spark.Engine() - - # Act - results = spark_engine._get_kafka_config( - 1, write_options={"user_opt": "ABC", "internal_kafka": True} - ) - - # Assert - assert results == { - "kafka.bootstrap.servers": "test_bootstrap_servers", - "kafka.security.protocol": "test_security_protocol", - "kafka.ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", - "kafka.ssl.key.password": "test_ssl_key_password", - "kafka.ssl.keystore.location": "result_from_add_file", - "kafka.ssl.keystore.password": "test_ssl_keystore_password", - "kafka.ssl.truststore.location": 
"result_from_add_file", - "kafka.ssl.truststore.password": "test_ssl_truststore_password", - "kafka.test_option_name": "test_option_value", - "user_opt": "ABC", - "internal_kafka": True, - } - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_count == 1 - ) - assert ( - mock_storage_connector_api.return_value.get_kafka_connector.call_args[0][1] - is False - ) diff --git a/python/tests/test_feature_group_writer.py b/python/tests/test_feature_group_writer.py index f4d980a99d..861022e5c9 100644 --- a/python/tests/test_feature_group_writer.py +++ b/python/tests/test_feature_group_writer.py @@ -56,11 +56,16 @@ def test_fg_writer_cache_management(self, mocker, dataframe_fixture_basic): mocker.MagicMock(), mocker.MagicMock(), ) + headers = { + "projectId": str(99).encode("utf8"), + "featureGroupId": str(32).encode("utf8"), + "subjectId": str(12).encode("utf8"), + } mock_init_kafka_resources = mocker.patch( - "hsfs.engine.python.Engine._init_kafka_resources", - return_value=(producer, feature_writers, writer_m), + "hsfs.core.kafka_engine._init_kafka_resources", + return_value=(producer, headers, feature_writers, writer_m), ) - mocker.patch("hsfs.engine.python.Engine._encode_complex_features") + mocker.patch("hsfs.core.kafka_engine.encode_complex_features") mocker.patch("hsfs.core.job.Job") mocker.patch("hsfs.engine.get_type", return_value="python") @@ -86,6 +91,7 @@ def test_fg_writer_cache_management(self, mocker, dataframe_fixture_basic): assert writer._feature_group._kafka_producer == producer assert writer._feature_group._feature_writers == feature_writers assert writer._feature_group._writer == writer_m + assert writer._feature_group._kafka_headers == headers writer.insert(dataframe_fixture_basic) # after second insert should have been called only once @@ -96,6 +102,7 @@ def test_fg_writer_cache_management(self, mocker, dataframe_fixture_basic): assert fg._multi_part_insert is False assert fg._kafka_producer is None assert fg._feature_writers is None + assert fg._kafka_headers is None assert fg._writer is None def test_fg_writer_without_context_manager(self, mocker, dataframe_fixture_basic): @@ -107,11 +114,16 @@ def test_fg_writer_without_context_manager(self, mocker, dataframe_fixture_basic mocker.MagicMock(), mocker.MagicMock(), ) + headers = { + "projectId": str(99).encode("utf8"), + "featureGroupId": str(32).encode("utf8"), + "subjectId": str(12).encode("utf8"), + } mock_init_kafka_resources = mocker.patch( - "hsfs.engine.python.Engine._init_kafka_resources", - return_value=(producer, feature_writers, writer_m), + "hsfs.core.kafka_engine._init_kafka_resources", + return_value=(producer, headers, feature_writers, writer_m), ) - mocker.patch("hsfs.engine.python.Engine._encode_complex_features") + mocker.patch("hsfs.core.kafka_engine.encode_complex_features") mocker.patch("hsfs.core.job.Job") mocker.patch("hsfs.engine.get_type", return_value="python") @@ -133,6 +145,7 @@ def test_fg_writer_without_context_manager(self, mocker, dataframe_fixture_basic assert fg._multi_part_insert is True assert fg._kafka_producer == producer assert fg._feature_writers == feature_writers + assert fg._kafka_headers == headers assert fg._writer == writer_m fg.multi_part_insert(dataframe_fixture_basic) @@ -145,4 +158,5 @@ def test_fg_writer_without_context_manager(self, mocker, dataframe_fixture_basic assert fg._multi_part_insert is False assert fg._kafka_producer is None assert fg._feature_writers is None + assert fg._kafka_headers is None assert fg._writer is None From 
cf4c3e0c10ee2d5e486c3a3ddf232086dc4bd320 Mon Sep 17 00:00:00 2001 From: Aleksey Veresov Date: Mon, 8 Jul 2024 15:17:36 +0200 Subject: [PATCH 10/11] Fix mistype of _project_id in engine/python (#1364) --- python/hsfs/engine/python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 5e2b9699e8..a755c58862 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -1235,7 +1235,7 @@ def _write_dataframe_kafka( producer, headers, feature_writers, writer = kafka_engine.init_kafka_resources( feature_group, offline_write_options, - project_id=client.get_instance().project_id, + project_id=client.get_instance()._project_id, ) if not feature_group._multi_part_insert: # set initial_check_point to the current offset From 70dd4355313545471de1015467f3847db5c5fa48 Mon Sep 17 00:00:00 2001 From: kennethmhc Date: Mon, 8 Jul 2024 16:14:47 +0200 Subject: [PATCH 11/11] [APPEND][FSTORE-1424] Feature logging fix fv logging_enabled (#1366) --- python/hsfs/feature_view.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 20e7237465..d28b53da15 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -3426,7 +3426,7 @@ def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureView": "training_helper_columns", "schema", "serving_keys", - "enabled_logging", + "logging_enabled", ]: self._update_attribute_if_present(self, other, key) self._init_feature_monitoring_engine()
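
Note on patch 11/11: `update_from_response_json` copies a fixed list of attribute names from a freshly parsed `FeatureView` onto `self`, so a misspelled key ("enabled_logging" instead of "logging_enabled") silently skips that attribute instead of failing. The sketch below is a minimal, hypothetical illustration of that copy pattern; the body of `_update_attribute_if_present` and the attribute names are assumptions inferred from the diff above, not the actual hsfs implementation.

    # Hypothetical sketch of the attribute-copy pattern touched by patch 11/11.
    class FeatureViewSketch:
        def __init__(self, name, logging_enabled=False):
            self._name = name
            self._logging_enabled = logging_enabled

        @staticmethod
        def _update_attribute_if_present(this, other, key):
            # Copy "_<key>" from the parsed object onto the current one,
            # but only if the parsed object actually carries that attribute.
            if getattr(other, f"_{key}", None) is not None:
                setattr(this, f"_{key}", getattr(other, f"_{key}"))

        def update_from_response_json_sketch(self, other):
            # A key that does not match the real attribute name
            # (e.g. "enabled_logging" vs. "logging_enabled") is silently
            # skipped, which is the behavior the patch corrects.
            for key in ["name", "logging_enabled"]:
                self._update_attribute_if_present(self, other, key)
            return self

    fv = FeatureViewSketch("fv", logging_enabled=False)
    parsed = FeatureViewSketch("fv", logging_enabled=True)
    fv.update_from_response_json_sketch(parsed)
    assert fv._logging_enabled is True

Under this reading, listing the wrong key left `_logging_enabled` untouched after a refresh from the backend, which is why the one-line key rename is the entire fix.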