From fca10e2eb5ed89d05b85b54e3754b8b6867fcf68 Mon Sep 17 00:00:00 2001 From: "Mats E. Mollestad" Date: Sat, 9 Dec 2023 18:15:39 +0100 Subject: [PATCH] feat: added some conveniance methods for feature_wrappers --- aligned/compiler/feature_factory.py | 28 ++++++- aligned/data_source/batch_data_source.py | 4 +- aligned/feature_view/feature_view.py | 35 ++++++++ .../feature_view/tests/test_joined_source.py | 63 +++++++++++++++ aligned/retrival_job.py | 76 +++++++++++++++--- aligned/schemas/feature.py | 24 +++++- aligned/tests/test_feature_view_wrapper.py | 49 +++++++++-- aligned/validation/interface.py | 8 +- aligned/validation/pandera.py | 7 +- pyproject.toml | 7 +- test_data/feature-store.json | 2 +- test_data/test_model.parquet | Bin 624 -> 624 bytes 12 files changed, 272 insertions(+), 31 deletions(-) diff --git a/aligned/compiler/feature_factory.py b/aligned/compiler/feature_factory.py index c12b257..b2656ae 100644 --- a/aligned/compiler/feature_factory.py +++ b/aligned/compiler/feature_factory.py @@ -642,10 +642,10 @@ def __floordiv__(self, other: FeatureFactory | Any) -> Float: feature.transformation = RatioFactory(self, LiteralValue.from_value(other)) return feature - def __abs__(self) -> Float: + def __abs__(self) -> Int64: from aligned.compiler.transformation_factory import AbsoluteFactory - feature = Float() + feature = Int64() feature.transformation = AbsoluteFactory(self) return feature @@ -850,6 +850,30 @@ def aggregate(self) -> ArithmeticAggregation: return ArithmeticAggregation(self) +class Int8(ArithmeticFeature, CouldBeEntityFeature, CouldBeModelVersion): + def copy_type(self) -> Int8: + return Int8() + + @property + def dtype(self) -> FeatureType: + return FeatureType.int8() + + def aggregate(self) -> ArithmeticAggregation: + return ArithmeticAggregation(self) + + +class Int16(ArithmeticFeature, CouldBeEntityFeature, CouldBeModelVersion): + def copy_type(self) -> Int16: + return Int16() + + @property + def dtype(self) -> FeatureType: + return FeatureType.int16() + + def aggregate(self) -> ArithmeticAggregation: + return ArithmeticAggregation(self) + + class Int32(ArithmeticFeature, CouldBeEntityFeature, CouldBeModelVersion): def copy_type(self) -> Int32: return Int32() diff --git a/aligned/data_source/batch_data_source.py b/aligned/data_source/batch_data_source.py index c6d38dd..8929063 100644 --- a/aligned/data_source/batch_data_source.py +++ b/aligned/data_source/batch_data_source.py @@ -180,7 +180,9 @@ def multi_source_features_for( source, _ = requests[0] if isinstance(source, BatchSourceModification): - return source.wrap_job(type(source.source).multi_source_features_for(facts, requests)) + return source.wrap_job( + type(source.source).multi_source_features_for(facts, requests) # type: ignore + ) elif isinstance(source, DataFileReference): from aligned.local.job import FileFactualJob diff --git a/aligned/feature_view/feature_view.py b/aligned/feature_view/feature_view.py index f446238..994ac3e 100644 --- a/aligned/feature_view/feature_view.py +++ b/aligned/feature_view/feature_view.py @@ -2,6 +2,8 @@ import copy import logging +import polars as pl +import pandas as pd from abc import ABC, abstractproperty from dataclasses import dataclass, field @@ -24,6 +26,7 @@ resolve_keys, ) from aligned.data_source.stream_data_source import StreamDataSource +from aligned.retrival_job import ConvertableToRetrivalJob, RetrivalJob from aligned.schemas.derivied_feature import ( AggregatedFeature, ) @@ -34,10 +37,13 @@ if TYPE_CHECKING: from aligned.feature_store import FeatureViewStore from datetime import datetime + from aligned.validation.interface import Validator # Enables code compleation in the select method T = TypeVar('T') +ConvertableData = TypeVar('ConvertableData', dict, pl.DataFrame, pd.DataFrame) + logger = logging.getLogger(__name__) @@ -330,6 +336,35 @@ class MyView: compiled = self.compile() return await FeatureView.freshness_in_source(compiled, compiled.source) + def from_data(self, data: ConvertableToRetrivalJob) -> RetrivalJob: + request = self.compile().request_all + return RetrivalJob.from_convertable(data, request) + + def drop_invalid(self, data: ConvertableData, validator: Validator | None = None) -> ConvertableData: + from aligned.retrival_job import DropInvalidJob + + if not validator: + from aligned.validation.pandera import PanderaValidator + + validator = PanderaValidator() + + features = list(DropInvalidJob.features_to_validate(self.compile().request_all.needed_requests)) + + if isinstance(data, dict): + validate_data = pd.DataFrame(data) + else: + validate_data = data + + if isinstance(validate_data, pl.DataFrame): + return validator.validate_polars(features, validate_data.lazy()).collect() + elif isinstance(validate_data, pd.DataFrame): + validated = validator.validate_pandas(features, validate_data) + if isinstance(data, dict): + return validated.to_dict(orient='list') + return validated # type: ignore + else: + raise ValueError(f'Invalid data type: {type(data)}') + class FeatureView(ABC): """ diff --git a/aligned/feature_view/tests/test_joined_source.py b/aligned/feature_view/tests/test_joined_source.py index e69de29..3f92408 100644 --- a/aligned/feature_view/tests/test_joined_source.py +++ b/aligned/feature_view/tests/test_joined_source.py @@ -0,0 +1,63 @@ +import pytest +from aligned import feature_view, Int32, FileSource +import polars as pl + + +@feature_view(name='left', source=FileSource.csv_at('some_file.csv')) +class LeftData: + + some_id = Int32().as_entity() + + feature = Int32() + + +@feature_view(name='right', source=FileSource.csv_at('some_file.csv')) +class RightData: + + some_id = Int32().as_entity() + + other_feature = Int32() + + +@pytest.mark.asyncio +async def test_join_different_types_polars() -> None: + + left_data = LeftData.from_data( # type: ignore + pl.DataFrame( + {'some_id': [1, 2, 3], 'feature': [2, 3, 4]}, schema={'some_id': pl.Int8, 'feature': pl.Int32} + ) + ) + + right_data = RightData.from_data( # type: ignore + pl.DataFrame( + {'some_id': [1, 3, 2], 'other_feature': [3, 4, 5]}, + schema={'some_id': pl.Int16, 'other_feature': pl.Int32}, + ) + ) + + expected_df = pl.DataFrame( + data={'some_id': [1, 2, 3], 'feature': [2, 3, 4], 'other_feature': [3, 5, 4]}, + schema={ + 'some_id': pl.Int32, + 'feature': pl.Int32, + 'other_feature': pl.Int32, + }, + ) + + new_data = left_data.join(right_data, 'inner', left_on='some_id', right_on='some_id') + result = await new_data.to_polars() + + joined = result.collect().sort('some_id', descending=False) + assert joined.frame_equal(expected_df.select(joined.columns)) + + +@pytest.mark.asyncio +async def test_unique_entities() -> None: + + left_data = LeftData.from_data( # type: ignore + pl.DataFrame( + {'some_id': [1, 3, 3], 'feature': [2, 3, 4]}, schema={'some_id': pl.Int8, 'feature': pl.Int32} + ) + ) + + left_data.unique_entities() diff --git a/aligned/retrival_job.py b/aligned/retrival_job.py index c1ffa1b..c9ec96c 100644 --- a/aligned/retrival_job.py +++ b/aligned/retrival_job.py @@ -371,7 +371,10 @@ def derive_features(self, requests: list[RetrivalRequest] | None = None) -> Retr def combined_features(self, requests: list[RetrivalRequest] | None = None) -> RetrivalJob: return CombineFactualJob([self], requests or self.retrival_requests) - def ensure_types(self, requests: list[RetrivalRequest]) -> RetrivalJob: + def ensure_types(self, requests: list[RetrivalRequest] | None = None) -> RetrivalJob: + if not requests: + requests = self.retrival_requests + return EnsureTypesJob(job=self, requests=requests) def select_columns(self, include_features: set[str]) -> RetrivalJob: @@ -392,6 +395,14 @@ def update_vector_index(self, indexes: list[VectorIndex]) -> RetrivalJob: def validate_entites(self) -> RetrivalJob: return ValidateEntitiesJob(self) + def unique_on(self, unique_on: list[str], sort_key: str | None = None) -> RetrivalJob: + return UniqueRowsJob(job=self, unique_on=unique_on, sort_key=sort_key) + + def unique_entities(self) -> RetrivalJob: + request = self.request_result + + return self.unique_on(unique_on=request.entity_columns, sort_key=request.event_timestamp) + def fill_missing_columns(self) -> RetrivalJob: return FillMissingColumnsJob(self) @@ -585,6 +596,29 @@ async def to_polars(self) -> pl.LazyFrame: left = await self.left_job.to_polars() right = await self.right_job.to_polars() + return_request = self.left_job.request_result + + # Need to ensure that the data types are the same. Otherwise will the join fail + for left_col, right_col in zip(self.left_on, self.right_on): + polars_type = [ + feature + for feature in return_request.features.union(return_request.entities) + if feature.name == left_col + ] + if not polars_type: + raise ValueError(f'Unable to find {left_col} in left request {return_request}.') + + polars_type = polars_type[0].dtype.polars_type + + left_column_dtypes = dict(zip(left.columns, left.dtypes)) + right_column_dtypes = dict(zip(right.columns, right.dtypes)) + + if not left_column_dtypes[left_col].is_(polars_type): + left = left.with_columns(pl.col(left_col).cast(polars_type)) + + if not right_column_dtypes[right_col].is_(polars_type): + right = right.with_columns(pl.col(right_col).cast(polars_type)) + return left.join(right, left_on=self.left_on, right_on=self.right_on, how=self.method) def log_each_job(self) -> RetrivalJob: @@ -816,20 +850,21 @@ def request_result(self) -> RequestResult: def retrival_requests(self) -> list[RetrivalRequest]: return self.job.retrival_requests - @property - def features_to_validate(self) -> set[Feature]: - return RequestResult.from_request_list( - [request for request in self.retrival_requests if not request.aggregated_features] - ).features + @staticmethod + def features_to_validate(retrival_requests: list[RetrivalRequest]) -> set[Feature]: + result = RequestResult.from_request_list( + [request for request in retrival_requests if not request.aggregated_features] + ) + return result.features.union(result.entities) async def to_pandas(self) -> pd.DataFrame: - return await self.validator.validate_pandas( - list(self.features_to_validate), await self.job.to_pandas() + return self.validator.validate_pandas( + list(DropInvalidJob.features_to_validate(self.retrival_requests)), await self.job.to_pandas() ) async def to_polars(self) -> pl.LazyFrame: - return await self.validator.validate_polars( - list(self.features_to_validate), await self.job.to_polars() + return self.validator.validate_polars( + list(DropInvalidJob.features_to_validate(self.retrival_requests)), await self.job.to_polars() ) def with_subfeatures(self) -> RetrivalJob: @@ -918,6 +953,25 @@ def remove_derived_features(self) -> RetrivalJob: return self.job.remove_derived_features() +@dataclass +class UniqueRowsJob(RetrivalJob, ModificationJob): + + job: RetrivalJob + unique_on: list[str] + sort_key: str | None = field(default=None) + + async def to_pandas(self) -> pd.DataFrame: + return (await self.to_polars()).collect().to_pandas() + + async def to_polars(self) -> pl.LazyFrame: + data = await self.job.to_polars() + + if self.sort_key: + data = data.sort(self.sort_key, descending=True) + + return data.unique(self.unique_on, keep='first').lazy() + + @dataclass class ValidateEntitiesJob(RetrivalJob, ModificationJob): @@ -932,7 +986,7 @@ async def to_pandas(self) -> pd.DataFrame: return data - async def to_polars(self) -> pl.DataFrame: + async def to_polars(self) -> pl.LazyFrame: data = await self.job.to_polars() for request in self.retrival_requests: diff --git a/aligned/schemas/feature.py b/aligned/schemas/feature.py index 5ac55d1..59c68f6 100644 --- a/aligned/schemas/feature.py +++ b/aligned/schemas/feature.py @@ -96,7 +96,15 @@ class FeatureType(Codable): @property def is_numeric(self) -> bool: - return self.name in {'bool', 'int32', 'int64', 'float', 'double'} # Can be represented as an int + return self.name in { + 'bool', + 'int8', + 'int16', + 'int32', + 'int64', + 'float', + 'double', + } # Can be represented as an int @property def python_type(self) -> type: @@ -107,6 +115,8 @@ def python_type(self) -> type: return { 'string': str, + 'int8': int, + 'int16': int, 'int32': int, 'int64': int, 'float': float, @@ -127,6 +137,8 @@ def pandas_type(self) -> str | type: return { 'string': str, + 'int8': 'Int8', + 'int16': 'Int16', 'int32': 'Int32', 'int64': 'Int64', 'float': np.float64, @@ -149,6 +161,8 @@ def polars_type(self) -> type: def feature_factory(self) -> ff.FeatureFactory: return { 'string': ff.String(), + 'int8': ff.Int8(), + 'int16': ff.Int16(), 'int32': ff.Int32(), 'int64': ff.Int64(), 'float': ff.Float(), @@ -186,6 +200,14 @@ def from_polars(polars_type: pl.DataType) -> FeatureType: def string() -> FeatureType: return FeatureType(name='string') + @staticmethod + def int8() -> FeatureType: + return FeatureType(name='int8') + + @staticmethod + def int16() -> FeatureType: + return FeatureType(name='int16') + @staticmethod def int32() -> FeatureType: return FeatureType(name='int32') diff --git a/aligned/tests/test_feature_view_wrapper.py b/aligned/tests/test_feature_view_wrapper.py index bca7196..09d135f 100644 --- a/aligned/tests/test_feature_view_wrapper.py +++ b/aligned/tests/test_feature_view_wrapper.py @@ -1,14 +1,28 @@ +# type: ignore +import pytest from aligned import feature_view, String, Int32, FileSource from aligned.schemas.feature import FeatureLocation -def test_feature_view_wrapper_feature_references() -> None: - @feature_view(name='test', source=FileSource.csv_at('some_file.csv')) - class Test: +@feature_view(name='test', source=FileSource.csv_at('some_file.csv')) +class Test: + + some_id = Int32().as_entity() + + feature = String() + + +@feature_view(name='test', source=FileSource.csv_at('some_file.csv')) +class TestDerived: - some_id = Int32().as_entity() + some_id = Int32().as_entity() - feature = String() + feature = String() + + contains_hello = feature.contains('Hello') + + +def test_feature_view_wrapper_feature_references() -> None: NewTest = Test.filter('new_test', where=lambda view: view.feature == 'test') # type: ignore @@ -18,3 +32,28 @@ class Test: assert new_test.feature._location == FeatureLocation.feature_view('new_test') assert test.feature._location == FeatureLocation.feature_view('test') assert new_test.some_id._location != test.some_id._location + + +@pytest.mark.asyncio +async def test_feature_view_wrapper_from_data() -> None: + + test_job = Test.from_data({'some_id': [10, 2, 4], 'feature': ['Hello', 'Test', 'World']}) + + result = await test_job.to_pandas() + assert result.shape[0] == 3 + assert result.shape[1] == 2 + + test_job = TestDerived.from_data({'some_id': [10, 2, 4], 'feature': ['Hello', 'Test', 'World']}) + + result = await test_job.to_pandas() + assert result.shape[0] == 3 + assert result.shape[1] == 2 + + result = await test_job.derive_features().to_pandas() + assert result.shape[0] == 3 + assert result.shape[1] == 3 + + test_invalid_result = Test.drop_invalid({'some_id': ['hello', 10, 2], 'feature': ['Hello', 'test', 2]}) + + # Returns two as the int can be casted to a str, but a str can not be casted to int + assert len(test_invalid_result['some_id']) == 2 diff --git a/aligned/validation/interface.py b/aligned/validation/interface.py index 6cae348..54df4d1 100644 --- a/aligned/validation/interface.py +++ b/aligned/validation/interface.py @@ -5,8 +5,8 @@ class Validator: - async def validate_pandas(self, features: list[Feature], df: pd.DataFrame) -> pd.DataFrame: - pass + def validate_pandas(self, features: list[Feature], df: pd.DataFrame) -> pd.DataFrame: + raise NotImplementedError(type(self)) - async def validate_polars(self, features: list[Feature], df: pl.LazyFrame) -> pl.LazyFrame: - pass + def validate_polars(self, features: list[Feature], df: pl.LazyFrame) -> pl.LazyFrame: + raise NotImplementedError(type(self)) diff --git a/aligned/validation/pandera.py b/aligned/validation/pandera.py index e1bbd9b..a66f30f 100644 --- a/aligned/validation/pandera.py +++ b/aligned/validation/pandera.py @@ -37,7 +37,6 @@ class PanderaValidator(Validator): } def _column_for(self, feature: Feature) -> Column: - Check.str_matches if feature.constraints is None: return Column( @@ -66,10 +65,10 @@ def _build_schema(self, features: list[Feature]) -> DataFrameSchema: columns={feature.name: self._column_for(feature) for feature in features}, drop_invalid_rows=True ) - async def validate_pandas(self, features: list[Feature], df: pd.DataFrame) -> pd.DataFrame: + def validate_pandas(self, features: list[Feature], df: pd.DataFrame) -> pd.DataFrame: schema = self._build_schema(features) return schema.validate(df, lazy=True) - async def validate_polars(self, features: list[Feature], df: pl.LazyFrame) -> pl.LazyFrame: + def validate_polars(self, features: list[Feature], df: pl.LazyFrame) -> pl.LazyFrame: input_df = df.collect().to_pandas() - return pl.from_pandas(await self.validate_pandas(features, input_df)).lazy() + return pl.from_pandas(self.validate_pandas(features, input_df)).lazy() diff --git a/pyproject.toml b/pyproject.toml index a9e1ce1..0801639 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "aligned" -version = "0.0.49" -description = "A scalable feature store that makes it easy to align offline and online ML systems" +version = "0.0.50" +description = "A data managment and lineage tool for ML applications." authors = ["Mats E. Mollestad "] license = "Apache-2.0" readme = "README.md" @@ -17,6 +17,9 @@ keywords = [ 'feature-store', 'feast', 'tecton', + 'dbt', + 'data', + 'lineage' ] classifiers = [ 'Development Status :: 3 - Alpha', diff --git a/test_data/feature-store.json b/test_data/feature-store.json index e8f6f3e..cafdfe0 100644 --- a/test_data/feature-store.json +++ b/test_data/feature-store.json @@ -1 +1 @@ -{"metadata": {"created_at": "2023-12-04T07:33:52.227841", "name": "feature_store_location.py", "github_url": null}, "feature_views": [{"name": "titanic", "tags": {}, "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "updated_at", "dtype": {"name": "datetime"}, "description": null, "tags": null, "constraints": null}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}], "derived_features": [{"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}, {"name": "titanic_parquet", "tags": {}, "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}], "derived_features": [{"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "event_triggers": null, "contacts": null, "indexes": []}], "combined_feature_views": [], "models": [{"name": "titanic", "features": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}], "predictions_view": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}]}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_folder": null}], "enrichers": []} +{"metadata": {"created_at": "2023-12-09T15:15:58.428750", "name": "feature_store_location.py", "github_url": null}, "feature_views": [{"name": "titanic_parquet", "tags": {}, "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "event_triggers": null, "contacts": null, "indexes": []}, {"name": "titanic", "tags": {}, "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "updated_at", "dtype": {"name": "datetime"}, "description": null, "tags": null, "constraints": null}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}], "combined_feature_views": [], "models": [{"name": "titanic", "features": [{"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}], "predictions_view": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}]}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_folder": null}], "enrichers": []} diff --git a/test_data/test_model.parquet b/test_data/test_model.parquet index 0dcf1ec2f6fc3ca99d1cfc498f05a1fbe2387eb8..4069aa24ce9b2e3145aeebc400bc344021883c30 100644 GIT binary patch delta 53 tcmeys@_}W7oG=RmCnJLhC%Xd!9|Ht%O*Axv2s49(nV`aq8>_xC0szEg2E_mX delta 53 scmeys@_}W7oG>2)CnEzFC%Xd!3j+j*Of)nTW&#N_A`3Hbtop_X0K^vt#Q*>R