Skip to content

Commit

Permalink
added check schema wrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
MatsMoll committed Mar 2, 2024
1 parent 4a483a2 commit 70e5530
Show file tree
Hide file tree
Showing 14 changed files with 254 additions and 174 deletions.
6 changes: 2 additions & 4 deletions aligned/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@
from aligned.data_source.stream_data_source import HttpStreamSource
from aligned.data_source.batch_data_source import CustomMethodDataSource
from aligned.feature_store import FeatureStore
from aligned.feature_view import (
feature_view,
combined_feature_view,
)
from aligned.feature_view import feature_view, combined_feature_view, check_schema
from aligned.schemas.text_vectoriser import EmbeddingModel
from aligned.sources.kafka import KafkaConfig
from aligned.sources.local import FileSource
Expand Down Expand Up @@ -68,4 +65,5 @@
# Schemas
'FeatureLocation',
'FeatureInputVersions',
'check_schema',
]
13 changes: 13 additions & 0 deletions aligned/data_source/batch_data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,19 @@ def features_for(self, facts: RetrivalJob, request: RetrivalRequest) -> Retrival
request=request, method=lambda: dill.loads(self.features_for_method)(facts, request)
)

@classmethod
def multi_source_features_for(
    cls: type[T], facts: RetrivalJob, requests: list[tuple[T, RetrivalRequest]]
) -> RetrivalJob:
    """
    Load features for `facts` from the sources listed in `requests`.

    Only a single `(source, request)` pair is handled here. The call is then
    forwarded to `features_for` on that source.

    Args:
        facts: The job that is passed on to the source's `features_for`.
        requests: Pairs of a source and the request to load from that source.

    Returns:
        The job returned by `features_for` on the only source in `requests`.

    Raises:
        NotImplementedError: If `requests` does not contain exactly one pair.
    """
    if len(requests) == 1:
        only_source, only_request = requests[0]
        return only_source.features_for(facts, only_request)  # type: ignore

    raise NotImplementedError(
        f'Type: {cls} have not implemented how to load fact data with multiple sources.'
    )

@staticmethod
def from_methods(
all_data: Callable[[RetrivalRequest, int | None], Coroutine[None, None, pl.LazyFrame]] | None = None,
Expand Down
3 changes: 2 additions & 1 deletion aligned/feature_view/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from aligned.feature_view.combined_view import (
combined_feature_view,
)
from aligned.feature_view.feature_view import feature_view
from aligned.feature_view.feature_view import feature_view, check_schema

# Names re-exported by the `aligned.feature_view` package.
__all__ = [
    'feature_view',
    'combined_feature_view',
    'check_schema',
]
80 changes: 80 additions & 0 deletions aligned/feature_view/feature_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,3 +702,83 @@ class MyView:
{feature_code}
"""


def check_schema() -> Callable:
    """
    A wrapper that checks the schema of data frames given a feature view or model contract.

    Every argument whose parameter is annotated as `Annotated[..., <feature view>]`
    is inspected before the wrapped function is called. The argument can be a pandas
    data frame, a polars data frame or lazy frame, or a dict keyed by column name.
    `Annotated` parameters without a feature view in their metadata are ignored,
    and so is the return annotation.

    ```python
    @feature_view(...)
    class MyView:
        id = Int32().as_entity()
        name = String()

    @check_schema()
    def my_function(data: Annotated[pd.DataFrame, MyView]):
        ...

    # Will raise an error since the name column is missing
    my_function(pd.DataFrame({
        "id": [1, 2, 3],
    }))
    ```

    Returns:
        Callable: A decorator. The decorated function raises a `ValueError` when an
            annotated argument is not passed, has an unsupported type,
            or is missing one of the expected columns.
    """
    from functools import wraps
    from typing import Annotated, get_origin

    def wrapper_metadata(annotation: Any) -> 'FeatureViewWrapper | None':
        # Returns the first feature view wrapper in the `Annotated` metadata, if any.
        for val in annotation.__metadata__:
            if isinstance(val, FeatureViewWrapper):
                return val
        return None

    def decorator(func: Callable) -> Callable:
        @wraps(func)  # Keeps the name, docstring and annotations of the wrapped function
        def func_wrapper(*args, **kwargs) -> Any:
            # The 'return' entry describes the result and not an argument,
            # so it can never be found among the passed arguments.
            params_to_check = {
                name: annotation
                for name, annotation in func.__annotations__.items()
                if name != 'return' and get_origin(annotation) is Annotated
            }

            # Only the first `co_argcount` names can be bound positionally.
            # The rest of `co_varnames` are keyword-only, `*args`, `**kwargs` and local names.
            code = func.__code__
            positional_names = code.co_varnames[: code.co_argcount]
            all_args = {**kwargs, **dict(zip(positional_names, args))}

            for key, annotation in params_to_check.items():
                wrapper = wrapper_metadata(annotation)
                if wrapper is None:
                    continue

                if key not in all_args:
                    raise ValueError(f"Unable to find {key}")

                view = wrapper.compile()
                df = all_args[key]

                if isinstance(df, (pl.LazyFrame, pl.DataFrame, pd.DataFrame)):
                    columns = df.columns
                elif isinstance(df, dict):
                    columns = list(df.keys())
                else:
                    raise ValueError(f'Invalid data type: {type(df)}')

                # Sorted so the error message is the same on every run
                missing_columns = sorted(
                    {
                        feature.name
                        for feature in view.request_all.needed_requests[0].all_features
                        if feature.name not in columns
                    }
                )

                if missing_columns:
                    raise ValueError(
                        f"Missing columns: {missing_columns} in the dataframe '{key}'\n{df}."
                    )

            return func(*args, **kwargs)

        return func_wrapper

    return decorator
48 changes: 48 additions & 0 deletions aligned/feature_view/tests/test_check_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pytest
from aligned import Bool, Float, String, feature_view, FileSource
from aligned.feature_view.feature_view import check_schema
from typing import Annotated
import pandas as pd


# Feature view used as the schema in the `check_schema` tests below.
@feature_view(
    name='test',
    source=FileSource.parquet_at('test.parquet'),
)
class TestView:

    # The entity of the view
    id = String().as_entity()

    # The features of the view
    a = String()
    b = Bool()
    c = Float()


# Returns the input unchanged, so the tests only exercise the `check_schema` wrapper.
@check_schema()
def some_method(df: Annotated[pd.DataFrame, TestView]) -> pd.DataFrame:
    return df


def test_check_schema() -> None:
    """A frame containing the `id`, `a`, `b` and `c` columns is passed through `some_method`."""
    frame = pd.DataFrame(
        {
            'id': ['a', 'b', 'c'],
            'a': ['a', 'b', 'c'],
            'b': [True, False, True],
            'c': [1.0, 2.0, 3.0],
        }
    )

    returned = some_method(frame)

    assert frame.equals(returned)


def test_check_schema_error() -> None:
    """A frame without the `c` column is rejected by `some_method` with a `ValueError`."""
    df = pd.DataFrame(
        {
            'id': ['a', 'b', 'c'],
            'a': ['a', 'b', 'c'],
            'b': [True, False, True],
        }
    )

    # Match on the message so an unrelated `ValueError` can not make the test pass.
    with pytest.raises(ValueError, match=r"Missing columns: \['c'\]"):
        some_method(df)
64 changes: 2 additions & 62 deletions aligned/schemas/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

NAME_POLARS_MAPPING = {
'string': pl.Utf8,
'int8': pl.Int8,
'int16': pl.Int16,
'int32': pl.Int32,
'int64': pl.Int64,
'float': pl.Float64,
Expand All @@ -27,68 +29,6 @@
}


# @dataclass
# class SupportedTypes(Codable):

# string: String | None = field(default=None)

# def dtype(self) -> DataTypeInterface:
# values = [self.string]
# for value in values:
# if value:
# return value
# raise ValueError("Found no data type, the config could be corrupt.")


# @dataclass
# class DataTypeInterface(Codable):

# @property
# def python_type(self) -> type:
# raise NotImplementedError()

# @property
# def pandas_type(self) -> str | type:
# raise NotImplementedError()

# @property
# def polars_type(self) -> pl.DataType:
# raise NotImplementedError()

# @dataclass
# class String(DataTypeInterface):

# @property
# def python_type(self) -> type:
# return str

# @property
# def pandas_type(self) -> str | type:
# return str

# @property
# def polars_type(self) -> pl.DataType:
# return pl.Utf8()


# @dataclass
# class List(DataTypeInterface):

# inner_type: DataTypeInterface

# @property
# def python_type(self) -> type:
# return list

# @property
# def pandas_type(self) -> str | type:
# return str

# @property
# def polars_type(self) -> pl.DataType:
# return pl.List(self.inner_type.polars_type)


@dataclass
class FeatureType(Codable):
# FIXME: Should use a more Pythonic design, as this one did not behave as intended
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "aligned"
version = "0.0.72"
version = "0.0.73"
description = "A data management and lineage tool for ML applications."
authors = ["Mats E. Mollestad <mats@mollestad.no>"]
license = "Apache-2.0"
Expand Down Expand Up @@ -67,7 +67,7 @@ prometheus-fastapi-instrumentator = { version="^5.9.1", optional = true }
# sentence-transformers = { version = "^2.2.2", optional = true }
kafka-python = { version= "^2.0.2", optional = true }
connectorx = { version = "^0.3.2", optional = true }
asyncpg = {version = "^0.29.0", optional = true}
asyncpg = { version = "^0.29.0", optional = true }

[tool.poetry.extras]
aws = ["aioaws", "connectorx"]
Expand Down
Binary file modified test_data/credit_history_mater.parquet
Binary file not shown.
2 changes: 1 addition & 1 deletion test_data/feature-store.json

Large diffs are not rendered by default.

Binary file modified test_data/test_model.parquet
Binary file not shown.
2 changes: 1 addition & 1 deletion test_data/titanic-sets.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}, {"name": "optional"}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}, {"name": "optional"}]}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "has_siblings", "dtype": {"name": 
"bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []}
{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}, {"name": "in_domain", "values": ["male", "female"]}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}, {"name": "optional"}]}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", 
"location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []}
42 changes: 21 additions & 21 deletions test_data/titanic-test.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
survived,is_mr,age,sex,sibsp,is_male,cabin,passenger_id,name,has_siblings,is_female
False,True,22.0,male,0,True,,61,"Sirayanian, Mr. Orsen",False,False
True,False,38.0,female,0,False,B28,62,"Icard, Miss. Amelie",False,True
False,True,45.0,male,1,True,C83,63,"Harris, Mr. Henry Birkhardt",True,False
False,False,4.0,male,3,True,,64,"Skoog, Master. Harald",True,False
False,True,,male,0,True,,65,"Stewart, Mr. Albert A",False,False
True,False,,male,1,True,,66,"Moubarek, Master. Gerios",True,False
True,True,29.0,female,0,False,F33,67,"Nye, Mrs. (Elizabeth Ramell)",False,True
False,True,19.0,male,0,True,,68,"Crease, Mr. Ernest James",False,False
True,False,17.0,female,4,False,,69,"Andersson, Miss. Erna Alexandra",True,True
False,True,26.0,male,2,True,,70,"Kink, Mr. Vincenz",True,False
False,True,32.0,male,0,True,,71,"Jenkin, Mr. Stephen Curnow",False,False
False,False,16.0,female,5,False,,72,"Goodwin, Miss. Lillian Amy",True,True
False,True,21.0,male,0,True,,73,"Hood, Mr. Ambrose Jr",False,False
False,True,26.0,male,1,True,,74,"Chronopoulos, Mr. Apostolos",True,False
True,True,32.0,male,0,True,,75,"Bing, Mr. Lee",False,False
False,True,25.0,male,0,True,F G73,76,"Moen, Mr. Sigurd Hansen",False,False
False,True,,male,0,True,,77,"Staneff, Mr. Ivan",False,False
False,True,,male,0,True,,78,"Moutal, Mr. Rahamin Haim",False,False
True,False,0.83,male,0,True,,79,"Caldwell, Master. Alden Gates",False,False
True,False,30.0,female,0,False,,80,"Dowdell, Miss. Elizabeth",False,True
has_siblings,name,sex,cabin,sibsp,age,is_female,is_male,is_mr,survived,passenger_id
False,"Sirayanian, Mr. Orsen",male,,0,22.0,False,True,True,False,61
False,"Icard, Miss. Amelie",female,B28,0,38.0,True,False,False,True,62
True,"Harris, Mr. Henry Birkhardt",male,C83,1,45.0,False,True,True,False,63
True,"Skoog, Master. Harald",male,,3,4.0,False,True,False,False,64
False,"Stewart, Mr. Albert A",male,,0,,False,True,True,False,65
True,"Moubarek, Master. Gerios",male,,1,,False,True,False,True,66
False,"Nye, Mrs. (Elizabeth Ramell)",female,F33,0,29.0,True,False,True,True,67
False,"Crease, Mr. Ernest James",male,,0,19.0,False,True,True,False,68
True,"Andersson, Miss. Erna Alexandra",female,,4,17.0,True,False,False,True,69
True,"Kink, Mr. Vincenz",male,,2,26.0,False,True,True,False,70
False,"Jenkin, Mr. Stephen Curnow",male,,0,32.0,False,True,True,False,71
True,"Goodwin, Miss. Lillian Amy",female,,5,16.0,True,False,False,False,72
False,"Hood, Mr. Ambrose Jr",male,,0,21.0,False,True,True,False,73
True,"Chronopoulos, Mr. Apostolos",male,,1,26.0,False,True,True,False,74
False,"Bing, Mr. Lee",male,,0,32.0,False,True,True,True,75
False,"Moen, Mr. Sigurd Hansen",male,F G73,0,25.0,False,True,True,False,76
False,"Staneff, Mr. Ivan",male,,0,,False,True,True,False,77
False,"Moutal, Mr. Rahamin Haim",male,,0,,False,True,True,False,78
False,"Caldwell, Master. Alden Gates",male,,0,0.83,False,True,False,True,79
False,"Dowdell, Miss. Elizabeth",female,,0,30.0,True,False,False,True,80
Loading

0 comments on commit 70e5530

Please sign in to comment.