From 70e5530a79ef2f7b07cb5405cfb1b7bd4d7a5a66 Mon Sep 17 00:00:00 2001 From: "Mats E. Mollestad" Date: Sat, 2 Mar 2024 11:39:24 +0100 Subject: [PATCH] added check schema wrapper --- aligned/__init__.py | 6 +- aligned/data_source/batch_data_source.py | 13 ++ aligned/feature_view/__init__.py | 3 +- aligned/feature_view/feature_view.py | 80 ++++++++++++ .../feature_view/tests/test_check_schema.py | 48 +++++++ aligned/schemas/feature.py | 64 +-------- pyproject.toml | 4 +- test_data/credit_history_mater.parquet | Bin 972 -> 977 bytes test_data/feature-store.json | 2 +- test_data/test_model.parquet | Bin 586 -> 590 bytes test_data/titanic-sets.json | 2 +- test_data/titanic-test.csv | 42 +++--- test_data/titanic-train.csv | 122 +++++++++--------- test_data/titanic-validate.csv | 42 +++--- 14 files changed, 254 insertions(+), 174 deletions(-) create mode 100644 aligned/feature_view/tests/test_check_schema.py diff --git a/aligned/__init__.py b/aligned/__init__.py index ab5724b..a03b9c6 100644 --- a/aligned/__init__.py +++ b/aligned/__init__.py @@ -17,10 +17,7 @@ from aligned.data_source.stream_data_source import HttpStreamSource from aligned.data_source.batch_data_source import CustomMethodDataSource from aligned.feature_store import FeatureStore -from aligned.feature_view import ( - feature_view, - combined_feature_view, -) +from aligned.feature_view import feature_view, combined_feature_view, check_schema from aligned.schemas.text_vectoriser import EmbeddingModel from aligned.sources.kafka import KafkaConfig from aligned.sources.local import FileSource @@ -68,4 +65,5 @@ # Schemas 'FeatureLocation', 'FeatureInputVersions', + 'check_schema', ] diff --git a/aligned/data_source/batch_data_source.py b/aligned/data_source/batch_data_source.py index 40d402c..dc71ddc 100644 --- a/aligned/data_source/batch_data_source.py +++ b/aligned/data_source/batch_data_source.py @@ -321,6 +321,19 @@ def features_for(self, facts: RetrivalJob, request: RetrivalRequest) -> Retrival request=request, method=lambda: dill.loads(self.features_for_method)(facts, request) ) + @classmethod + def multi_source_features_for( + cls: type[T], facts: RetrivalJob, requests: list[tuple[T, RetrivalRequest]] + ) -> RetrivalJob: + + if len(requests) != 1: + raise NotImplementedError( + f'Type: {cls} have not implemented how to load fact data with multiple sources.' + ) + + source, request = requests[0] + return source.features_for(facts, request) # type: ignore + @staticmethod def from_methods( all_data: Callable[[RetrivalRequest, int | None], Coroutine[None, None, pl.LazyFrame]] | None = None, diff --git a/aligned/feature_view/__init__.py b/aligned/feature_view/__init__.py index cb9f1bb..eeb9db7 100644 --- a/aligned/feature_view/__init__.py +++ b/aligned/feature_view/__init__.py @@ -1,9 +1,10 @@ from aligned.feature_view.combined_view import ( combined_feature_view, ) -from aligned.feature_view.feature_view import feature_view +from aligned.feature_view.feature_view import feature_view, check_schema __all__ = [ 'feature_view', 'combined_feature_view', + 'check_schema', ] diff --git a/aligned/feature_view/feature_view.py b/aligned/feature_view/feature_view.py index 234e42f..fa8c0ed 100644 --- a/aligned/feature_view/feature_view.py +++ b/aligned/feature_view/feature_view.py @@ -702,3 +702,83 @@ class MyView: {feature_code} """ + + +def check_schema() -> Callable: + """ + A wrapper that checks the schema of data frames given a feature view or model contract. + + + ```python + @feature_view(...) + class MyView: + id = Int32().as_entity() + name = String() + + @check_schema() + def my_function(data: Annotated[pd.DataFrame, MyView]): + ... + + + # Will raise an error since the name column is missing + my_function(pd.DataFrame({ + "id": [1, 2, 3], + }) + ``` + """ + + def decorator(func: Callable) -> Callable: + def func_wrapper(*args, **kwargs) -> Any: + from typing import _AnnotatedAlias # type: ignore + + params_to_check = { + name: value for name, value in func.__annotations__.items() if type(value) == _AnnotatedAlias + } + + function_args = func.__code__.co_varnames + + # Naming args variables + all_args = kwargs.copy() + for index in range(len(args)): + all_args[function_args[index]] = args[index] + + def wrapper_metadata(value: Any) -> FeatureViewWrapper | None: + for val in value.__metadata__: + if isinstance(val, FeatureViewWrapper): + return val + return None + + for key, value in params_to_check.items(): + missing_columns = set() + + value = wrapper_metadata(value) + if value is None: + continue + + if key not in all_args: + raise ValueError(f"Unable to find {key}") + + view = value.compile() + df = all_args[key] + + if isinstance(df, (pl.LazyFrame, pl.DataFrame, pd.DataFrame)): + columns = df.columns + elif isinstance(df, dict): + columns = list(df.keys()) + else: + raise ValueError(f'Invalid data type: {type(df)}') + + for feature in view.request_all.needed_requests[0].all_features: + if feature.name not in columns: + missing_columns.add(feature.name) + + if missing_columns: + raise ValueError( + f"Missing columns: {list(missing_columns)} in the dataframe '{key}'\n{df}." + ) + + return func(*args, **kwargs) + + return func_wrapper + + return decorator diff --git a/aligned/feature_view/tests/test_check_schema.py b/aligned/feature_view/tests/test_check_schema.py new file mode 100644 index 0000000..9109aee --- /dev/null +++ b/aligned/feature_view/tests/test_check_schema.py @@ -0,0 +1,48 @@ +import pytest +from aligned import Bool, Float, String, feature_view, FileSource +from aligned.feature_view.feature_view import check_schema +from typing import Annotated +import pandas as pd + + +@feature_view( + name='test', + source=FileSource.parquet_at('test.parquet'), +) +class TestView: + + id = String().as_entity() + + a = String() + b = Bool() + c = Float() + + +@check_schema() +def some_method(df: Annotated[pd.DataFrame, TestView]) -> pd.DataFrame: + return df + + +def test_check_schema() -> None: + + df = pd.DataFrame( + {'id': ['a', 'b', 'c'], 'a': ['a', 'b', 'c'], 'b': [True, False, True], 'c': [1.0, 2.0, 3.0]} + ) + + res = some_method(df) + + assert df.equals(res) + + +def test_check_schema_error() -> None: + + df = pd.DataFrame( + { + 'id': ['a', 'b', 'c'], + 'a': ['a', 'b', 'c'], + 'b': [True, False, True], + } + ) + + with pytest.raises(ValueError): # noqa: PT011 + some_method(df) diff --git a/aligned/schemas/feature.py b/aligned/schemas/feature.py index a842aff..3a912b8 100644 --- a/aligned/schemas/feature.py +++ b/aligned/schemas/feature.py @@ -11,6 +11,8 @@ NAME_POLARS_MAPPING = { 'string': pl.Utf8, + 'int8': pl.Int8, + 'int16': pl.Int16, 'int32': pl.Int32, 'int64': pl.Int64, 'float': pl.Float64, @@ -27,68 +29,6 @@ } -# @dataclass -# class SupportedTypes(Codable): - -# string: String | None = field(default=None) - -# def dtype(self) -> DataTypeInterface: -# values = [self.string] -# for value in values: -# if value: -# return value -# raise ValueError("Found no data type, the config could be corrupt.") - - -# @dataclass -# class DataTypeInterface(Codable): - -# @property -# def python_type(self) -> type: -# raise NotImplementedError() - -# @property -# def pandas_type(self) -> str | type: -# raise NotImplementedError() - -# @property -# def polars_type(self) -> pl.DataType: -# raise NotImplementedError() - -# @dataclass -# class String(DataTypeInterface): - -# @property -# def python_type(self) -> type: -# return str - -# @property -# def pandas_type(self) -> str | type: -# return str - -# @property -# def polars_type(self) -> pl.DataType: -# return pl.Utf8() - - -# @dataclass -# class List(DataTypeInterface): - -# inner_type: DataTypeInterface - -# @property -# def python_type(self) -> type: -# return list - -# @property -# def pandas_type(self) -> str | type: -# return str - -# @property -# def polars_type(self) -> pl.DataType: -# return pl.List(self.inner_type.polars_type) - - @dataclass class FeatureType(Codable): # FIXME: Should use a more Pythonic design, as this one did not behave as intended diff --git a/pyproject.toml b/pyproject.toml index abf2ea2..143724c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "aligned" -version = "0.0.72" +version = "0.0.73" description = "A data managment and lineage tool for ML applications." authors = ["Mats E. Mollestad "] license = "Apache-2.0" @@ -67,7 +67,7 @@ prometheus-fastapi-instrumentator = { version="^5.9.1", optional = true } # sentence-transformers = { version = "^2.2.2", optional = true } kafka-python = { version= "^2.0.2", optional = true } connectorx = { version = "^0.3.2", optional = true } -asyncpg = {version = "^0.29.0", optional = true} +asyncpg = { version = "^0.29.0", optional = true } [tool.poetry.extras] aws = ["aioaws", "connectorx"] diff --git a/test_data/credit_history_mater.parquet b/test_data/credit_history_mater.parquet index 5f6b68c27d4fe4ddfdea48af6eabb076b038e8c6..b6416ab16d6925c73d0612ee15c59c62bed6a620 100644 GIT binary patch delta 406 zcmXX@F;Buk6uvu}iiGv5ja?ybfFwo|9Uc~23<9mD1qPK$0AsZYm|$339cgZHbawI| zxH+JUyFbF2ICu|z!+T%8?|t{Z8yLOjw1>hLK!RZ1l`6Cno%xz4Rlm*b1%W#5hyLcFO ztT5l&st+%&-KgKOy|7iR7Xrr|qoIad``Dxl%as>OcV!?ma5{fCm`f2L*OaD-gOZI@ a$0}XQ_onjjJ+iJgqp&*~lNlofKjRlMVQO~( delta 419 zcmcb}eukYTz%j^h;@$`*4u**=@l1OdC#Hn?G%zzr%7}4@CW$dHfLMDNMLQUw>;`7h z1Q1h{MbgKEr6@nYM3jv|luMOKLbSM~G$l2!Bt9oUF)u!)G&MdqH8F3pE2BQA7@MSw zq>WmZ%*1!<+#C#I8(2g|7}Sn1O`gdp!?TBxLF@>Ns04%B1LnyG7}b~>m?z6HsdD~c z(g%SG&&gg)E^MKWKoB;03X^}mYdMhV?pGNV9#oLxW|6l53`W?7XIZs2Gh z0Mu3qQskHpBHTfQqbtaSKu0G}M<+1t>gWs>2di^*45$KXvTy}Ufz1ypi_EdCN;Wcq zm{$c>3^FepW}pjx1B*Zg0_})&bV_%0b#wyR?^KoI5tbblZfX!=R8bZgZkiV55tb8S bWSA2fZsN!w!4{C8lUP*D(9Oia0E!s^c&l$H diff --git a/test_data/feature-store.json b/test_data/feature-store.json index 102e24e..9665338 100644 --- a/test_data/feature-store.json +++ b/test_data/feature-store.json @@ -1 +1 @@ -{"metadata": {"created_at": "2024-02-23T21:42:36.674269", "name": "feature_store_location.py", "repo_url": null, "github_url": null}, "feature_views": [{"name": "titanic", "tags": {}, "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "updated_at", "dtype": {"name": "datetime"}, "description": null, "tags": null, "constraints": null}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}, {"name": "titanic_parquet", "tags": {}, "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": []}], "combined_feature_views": [], "models": [{"name": "titanic", "features": {"default_version": "default", "versions": {"default": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}]}}, "predictions_view": {"entities": [], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}], "recommendation_targets": [], "acceptable_freshness": 86400.0, "unacceptable_freshness": 172800.0}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_store": null, "exposed_at_url": null}], "enrichers": []} +{"metadata": {"created_at": "2024-03-02T10:34:30.080269", "name": "feature_store_location.py", "repo_url": null, "github_url": null}, "feature_views": [{"name": "titanic_parquet", "tags": {}, "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "derived_features": [{"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": []}, {"name": "titanic", "tags": {}, "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "updated_at", "dtype": {"name": "datetime"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}], "combined_feature_views": [], "models": [{"name": "titanic", "features": {"default_version": "default", "versions": {"default": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}]}}, "predictions_view": {"entities": [], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}], "recommendation_targets": [], "acceptable_freshness": 86400.0, "unacceptable_freshness": 172800.0}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_store": null, "exposed_at_url": null}], "enrichers": []} diff --git a/test_data/test_model.parquet b/test_data/test_model.parquet index db5f293167fbddbcb3cc3f1fbdaf1ababe276bc5..2faa09fe0db7b01d723a4f4c19fbc69da646f157 100644 GIT binary patch delta 215 zcmX@ba*kz!oG>E;CnEz3C%Xd!69WV=Ppp!j=q)461()H2%7{!fl(L`0$RH^r#vz&^ z1{4Ld<}gB-qAZd&9xO%q`6Z$(45BQmOcG3)DG&x@B7+!*q>QAFT7k^OMiB)L2C)rH zq9P1xEle^%ZK@1xk`j!GqD*2OVg+IuYI7!YFsgC(FzJJUxy592#`)~7j*gB_8IF_h jGKR>e1KI95*^Z93whR(%0r@$JMa2v=85tM?9D@u2_=zOQ delta 239 zcmX@da*AbwoUjN3CnEzFC%Xd!9|HuiOstlg=p!S{43c34$uPlX45jO5F)~QXh;fK! zhyg``tXYf@rYMV~jR#9netwB43xg<&Dw70bB7|njOkoh?kd%?sQ7e!EDpO@(lLShN zGKq1B6^Lc1aWIH&U=kH!P-|j>O9S=5<$*Se9bpm`V-S15C^lJyQHryJNgo8vO(vT& r&S&>@baZqoh?;ztF+?&D$aZ&iWRPGB$j?bEDrT6<$iNWb7-R?lpXnz* diff --git a/test_data/titanic-sets.json b/test_data/titanic-sets.json index 32d63af..986ba66 100644 --- a/test_data/titanic-sets.json +++ b/test_data/titanic-sets.json @@ -1 +1 @@ -{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}, {"name": "optional"}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}, {"name": "optional"}]}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []} +{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}, {"name": "in_domain", "values": ["male", "female"]}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}, {"name": "optional"}]}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []} diff --git a/test_data/titanic-test.csv b/test_data/titanic-test.csv index 4563445..6aac248 100644 --- a/test_data/titanic-test.csv +++ b/test_data/titanic-test.csv @@ -1,21 +1,21 @@ -survived,is_mr,age,sex,sibsp,is_male,cabin,passenger_id,name,has_siblings,is_female -False,True,22.0,male,0,True,,61,"Sirayanian, Mr. Orsen",False,False -True,False,38.0,female,0,False,B28,62,"Icard, Miss. Amelie",False,True -False,True,45.0,male,1,True,C83,63,"Harris, Mr. Henry Birkhardt",True,False -False,False,4.0,male,3,True,,64,"Skoog, Master. Harald",True,False -False,True,,male,0,True,,65,"Stewart, Mr. Albert A",False,False -True,False,,male,1,True,,66,"Moubarek, Master. Gerios",True,False -True,True,29.0,female,0,False,F33,67,"Nye, Mrs. (Elizabeth Ramell)",False,True -False,True,19.0,male,0,True,,68,"Crease, Mr. Ernest James",False,False -True,False,17.0,female,4,False,,69,"Andersson, Miss. Erna Alexandra",True,True -False,True,26.0,male,2,True,,70,"Kink, Mr. Vincenz",True,False -False,True,32.0,male,0,True,,71,"Jenkin, Mr. Stephen Curnow",False,False -False,False,16.0,female,5,False,,72,"Goodwin, Miss. Lillian Amy",True,True -False,True,21.0,male,0,True,,73,"Hood, Mr. Ambrose Jr",False,False -False,True,26.0,male,1,True,,74,"Chronopoulos, Mr. Apostolos",True,False -True,True,32.0,male,0,True,,75,"Bing, Mr. Lee",False,False -False,True,25.0,male,0,True,F G73,76,"Moen, Mr. Sigurd Hansen",False,False -False,True,,male,0,True,,77,"Staneff, Mr. Ivan",False,False -False,True,,male,0,True,,78,"Moutal, Mr. Rahamin Haim",False,False -True,False,0.83,male,0,True,,79,"Caldwell, Master. Alden Gates",False,False -True,False,30.0,female,0,False,,80,"Dowdell, Miss. Elizabeth",False,True +has_siblings,name,sex,cabin,sibsp,age,is_female,is_male,is_mr,survived,passenger_id +False,"Sirayanian, Mr. Orsen",male,,0,22.0,False,True,True,False,61 +False,"Icard, Miss. Amelie",female,B28,0,38.0,True,False,False,True,62 +True,"Harris, Mr. Henry Birkhardt",male,C83,1,45.0,False,True,True,False,63 +True,"Skoog, Master. Harald",male,,3,4.0,False,True,False,False,64 +False,"Stewart, Mr. Albert A",male,,0,,False,True,True,False,65 +True,"Moubarek, Master. Gerios",male,,1,,False,True,False,True,66 +False,"Nye, Mrs. (Elizabeth Ramell)",female,F33,0,29.0,True,False,True,True,67 +False,"Crease, Mr. Ernest James",male,,0,19.0,False,True,True,False,68 +True,"Andersson, Miss. Erna Alexandra",female,,4,17.0,True,False,False,True,69 +True,"Kink, Mr. Vincenz",male,,2,26.0,False,True,True,False,70 +False,"Jenkin, Mr. Stephen Curnow",male,,0,32.0,False,True,True,False,71 +True,"Goodwin, Miss. Lillian Amy",female,,5,16.0,True,False,False,False,72 +False,"Hood, Mr. Ambrose Jr",male,,0,21.0,False,True,True,False,73 +True,"Chronopoulos, Mr. Apostolos",male,,1,26.0,False,True,True,False,74 +False,"Bing, Mr. Lee",male,,0,32.0,False,True,True,True,75 +False,"Moen, Mr. Sigurd Hansen",male,F G73,0,25.0,False,True,True,False,76 +False,"Staneff, Mr. Ivan",male,,0,,False,True,True,False,77 +False,"Moutal, Mr. Rahamin Haim",male,,0,,False,True,True,False,78 +False,"Caldwell, Master. Alden Gates",male,,0,0.83,False,True,False,True,79 +False,"Dowdell, Miss. Elizabeth",female,,0,30.0,True,False,False,True,80 diff --git a/test_data/titanic-train.csv b/test_data/titanic-train.csv index fdcc916..9ad5e1c 100644 --- a/test_data/titanic-train.csv +++ b/test_data/titanic-train.csv @@ -1,61 +1,61 @@ -survived,is_mr,age,sex,sibsp,is_male,cabin,passenger_id,name,has_siblings,is_female -False,True,22.0,male,1,True,,1,"Braund, Mr. Owen Harris",True,False -True,True,38.0,female,1,False,C85,2,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",True,True -True,False,26.0,female,0,False,,3,"Heikkinen, Miss. Laina",False,True -True,True,35.0,female,1,False,C123,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",True,True -False,True,35.0,male,0,True,,5,"Allen, Mr. William Henry",False,False -False,True,,male,0,True,,6,"Moran, Mr. James",False,False -False,True,54.0,other,0,False,E46,7,"McCarthy, Mr. Timothy J",False,False -False,False,2.0,male,3,True,,8,"Palsson, Master. Gosta Leonard",True,False -True,True,27.0,female,0,False,,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",False,True -True,True,14.0,female,1,False,,10,"Nasser, Mrs. Nicholas (Adele Achem)",True,True -True,False,4.0,female,1,False,G6,11,"Sandstrom, Miss. Marguerite Rut",True,True -True,False,58.0,female,0,False,C103,12,"Bonnell, Miss. Elizabeth",False,True -False,True,20.0,male,0,True,,13,"Saundercock, Mr. William Henry",False,False -False,True,39.0,male,1,True,,14,"Andersson, Mr. Anders Johan",True,False -False,False,14.0,female,0,False,,15,"Vestrom, Miss. Hulda Amanda Adolfina",False,True -True,True,55.0,female,0,False,,16,"Hewlett, Mrs. (Mary D Kingcome) ",False,True -False,False,2.0,male,4,True,,17,"Rice, Master. Eugene",True,False -True,True,,male,0,True,,18,"Williams, Mr. Charles Eugene",False,False -False,True,31.0,female,1,False,,19,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",True,True -True,True,,female,0,False,,20,"Masselmani, Mrs. Fatima",False,True -False,True,35.0,male,0,True,,21,"Fynney, Mr. Joseph J",False,False -True,True,34.0,male,0,True,D56,22,"Beesley, Mr. Lawrence",False,False -True,False,15.0,female,0,False,,23,"McGowan, Miss. Anna ""Annie""",False,True -True,True,28.0,male,0,True,A6,24,"Sloper, Mr. William Thompson",False,False -False,False,8.0,female,3,False,,25,"Palsson, Miss. Torborg Danira",True,True -True,True,38.0,female,1,False,,26,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",True,True -False,True,,male,0,True,,27,"Emir, Mr. Farred Chehab",False,False -False,True,19.0,male,3,True,C23 C25 C27,28,"Fortune, Mr. Charles Alexander",True,False -True,False,,female,0,False,,29,"O'Dwyer, Miss. Ellen ""Nellie""",False,True -False,True,,male,0,True,,30,"Todoroff, Mr. Lalio",False,False -False,False,40.0,male,0,True,,31,"Uruchurtu, Don. Manuel E",False,False -True,True,,female,1,False,B78,32,"Spencer, Mrs. William Augustus (Marie Eugenie)",True,True -True,False,,female,0,False,,33,"Glynn, Miss. Mary Agatha",False,True -False,True,66.0,male,0,True,,34,"Wheadon, Mr. Edward H",False,False -False,True,28.0,male,1,True,,35,"Meyer, Mr. Edgar Joseph",True,False -False,True,42.0,male,1,True,,36,"Holverson, Mr. Alexander Oskar",True,False -True,True,,male,0,True,,37,"Mamee, Mr. Hanna",False,False -False,True,21.0,male,0,True,,38,"Cann, Mr. Ernest Charles",False,False -False,False,18.0,female,2,False,,39,"Vander Planke, Miss. Augusta Maria",True,True -True,False,14.0,female,1,False,,40,"Nicola-Yarred, Miss. Jamila",True,True -False,True,40.0,female,1,False,,41,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",True,True -False,True,27.0,female,1,False,,42,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",True,True -False,True,,male,0,True,,43,"Kraeff, Mr. Theodor",False,False -True,False,3.0,female,1,False,,44,"Laroche, Miss. Simonne Marie Anne Andree",True,True -True,False,19.0,female,0,False,,45,"Devaney, Miss. Margaret Delia",False,True -False,True,,male,0,True,,46,"Rogers, Mr. William John",False,False -False,True,,male,1,True,,47,"Lennon, Mr. Denis",True,False -True,False,,female,0,False,,48,"O'Driscoll, Miss. Bridget",False,True -False,True,,male,2,True,,49,"Samaan, Mr. Youssef",True,False -False,True,18.0,female,1,False,,50,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",True,True -False,False,7.0,male,4,True,,51,"Panula, Master. Juha Niilo",True,False -False,True,21.0,male,0,True,,52,"Nosworthy, Mr. Richard Cater",False,False -True,True,49.0,female,1,False,D33,53,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",True,True -True,True,29.0,female,1,False,,54,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",True,True -False,True,65.0,male,0,True,B30,55,"Ostby, Mr. Engelhart Cornelius",False,False -True,True,,male,0,True,C52,56,"Woolner, Mr. Hugh",False,False -True,False,21.0,female,0,False,,57,"Rugg, Miss. Emily",False,True -False,True,28.5,male,0,True,,58,"Novel, Mr. Mansouer",False,False -True,False,5.0,female,1,False,,59,"West, Miss. Constance Mirium",True,True -False,False,11.0,male,5,True,,60,"Goodwin, Master. William Frederick",True,False +has_siblings,name,sex,cabin,sibsp,age,is_female,is_male,is_mr,survived,passenger_id +True,"Braund, Mr. Owen Harris",male,,1,22.0,False,True,True,False,1 +True,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,C85,1,38.0,True,False,True,True,2 +False,"Heikkinen, Miss. Laina",female,,0,26.0,True,False,False,True,3 +True,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,C123,1,35.0,True,False,True,True,4 +False,"Allen, Mr. William Henry",male,,0,35.0,False,True,True,False,5 +False,"Moran, Mr. James",male,,0,,False,True,True,False,6 +False,"McCarthy, Mr. Timothy J",other,E46,0,54.0,False,False,True,False,7 +True,"Palsson, Master. Gosta Leonard",male,,3,2.0,False,True,False,False,8 +False,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,,0,27.0,True,False,True,True,9 +True,"Nasser, Mrs. Nicholas (Adele Achem)",female,,1,14.0,True,False,True,True,10 +True,"Sandstrom, Miss. Marguerite Rut",female,G6,1,4.0,True,False,False,True,11 +False,"Bonnell, Miss. Elizabeth",female,C103,0,58.0,True,False,False,True,12 +False,"Saundercock, Mr. William Henry",male,,0,20.0,False,True,True,False,13 +True,"Andersson, Mr. Anders Johan",male,,1,39.0,False,True,True,False,14 +False,"Vestrom, Miss. Hulda Amanda Adolfina",female,,0,14.0,True,False,False,False,15 +False,"Hewlett, Mrs. (Mary D Kingcome) ",female,,0,55.0,True,False,True,True,16 +True,"Rice, Master. Eugene",male,,4,2.0,False,True,False,False,17 +False,"Williams, Mr. Charles Eugene",male,,0,,False,True,True,True,18 +True,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,,1,31.0,True,False,True,False,19 +False,"Masselmani, Mrs. Fatima",female,,0,,True,False,True,True,20 +False,"Fynney, Mr. Joseph J",male,,0,35.0,False,True,True,False,21 +False,"Beesley, Mr. Lawrence",male,D56,0,34.0,False,True,True,True,22 +False,"McGowan, Miss. Anna ""Annie""",female,,0,15.0,True,False,False,True,23 +False,"Sloper, Mr. William Thompson",male,A6,0,28.0,False,True,True,True,24 +True,"Palsson, Miss. Torborg Danira",female,,3,8.0,True,False,False,False,25 +True,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,,1,38.0,True,False,True,True,26 +False,"Emir, Mr. Farred Chehab",male,,0,,False,True,True,False,27 +True,"Fortune, Mr. Charles Alexander",male,C23 C25 C27,3,19.0,False,True,True,False,28 +False,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,,True,False,False,True,29 +False,"Todoroff, Mr. Lalio",male,,0,,False,True,True,False,30 +False,"Uruchurtu, Don. Manuel E",male,,0,40.0,False,True,False,False,31 +True,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,B78,1,,True,False,True,True,32 +False,"Glynn, Miss. Mary Agatha",female,,0,,True,False,False,True,33 +False,"Wheadon, Mr. Edward H",male,,0,66.0,False,True,True,False,34 +True,"Meyer, Mr. Edgar Joseph",male,,1,28.0,False,True,True,False,35 +True,"Holverson, Mr. Alexander Oskar",male,,1,42.0,False,True,True,False,36 +False,"Mamee, Mr. Hanna",male,,0,,False,True,True,True,37 +False,"Cann, Mr. Ernest Charles",male,,0,21.0,False,True,True,False,38 +True,"Vander Planke, Miss. Augusta Maria",female,,2,18.0,True,False,False,False,39 +True,"Nicola-Yarred, Miss. Jamila",female,,1,14.0,True,False,False,True,40 +True,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,,1,40.0,True,False,True,False,41 +True,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,,1,27.0,True,False,True,False,42 +False,"Kraeff, Mr. Theodor",male,,0,,False,True,True,False,43 +True,"Laroche, Miss. Simonne Marie Anne Andree",female,,1,3.0,True,False,False,True,44 +False,"Devaney, Miss. Margaret Delia",female,,0,19.0,True,False,False,True,45 +False,"Rogers, Mr. William John",male,,0,,False,True,True,False,46 +True,"Lennon, Mr. Denis",male,,1,,False,True,True,False,47 +False,"O'Driscoll, Miss. Bridget",female,,0,,True,False,False,True,48 +True,"Samaan, Mr. Youssef",male,,2,,False,True,True,False,49 +True,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,,1,18.0,True,False,True,False,50 +True,"Panula, Master. Juha Niilo",male,,4,7.0,False,True,False,False,51 +False,"Nosworthy, Mr. Richard Cater",male,,0,21.0,False,True,True,False,52 +True,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,D33,1,49.0,True,False,True,True,53 +True,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,,1,29.0,True,False,True,True,54 +False,"Ostby, Mr. Engelhart Cornelius",male,B30,0,65.0,False,True,True,False,55 +False,"Woolner, Mr. Hugh",male,C52,0,,False,True,True,True,56 +False,"Rugg, Miss. Emily",female,,0,21.0,True,False,False,True,57 +False,"Novel, Mr. Mansouer",male,,0,28.5,False,True,True,False,58 +True,"West, Miss. Constance Mirium",female,,1,5.0,True,False,False,True,59 +True,"Goodwin, Master. William Frederick",male,,5,11.0,False,True,False,False,60 diff --git a/test_data/titanic-validate.csv b/test_data/titanic-validate.csv index ec25eb2..6df9c0d 100644 --- a/test_data/titanic-validate.csv +++ b/test_data/titanic-validate.csv @@ -1,21 +1,21 @@ -survived,is_mr,age,sex,sibsp,is_male,cabin,passenger_id,name,has_siblings,is_female -False,True,22.0,male,0,True,,81,"Waelens, Mr. Achille",False,False -True,True,29.0,male,0,True,,82,"Sheerlinck, Mr. Jan Baptist",False,False -True,False,,female,0,False,,83,"McDermott, Miss. Brigdet Delia",False,True -False,True,28.0,male,0,True,,84,"Carrau, Mr. Francisco M",False,False -True,False,17.0,female,0,False,,85,"Ilett, Miss. Bertha",False,True -True,True,33.0,female,3,False,,86,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",True,True -False,True,16.0,male,1,True,,87,"Ford, Mr. William Neal",True,False -False,True,,male,0,True,,88,"Slocovski, Mr. Selman Francis",False,False -True,False,23.0,female,3,False,C23 C25 C27,89,"Fortune, Miss. Mabel Helen",True,True -False,True,24.0,male,0,True,,90,"Celotti, Mr. Francesco",False,False -False,True,29.0,male,0,True,,91,"Christmann, Mr. Emil",False,False -False,True,20.0,male,0,True,,92,"Andreasson, Mr. Paul Edvin",False,False -False,True,46.0,male,1,True,E31,93,"Chaffee, Mr. Herbert Fuller",True,False -False,True,26.0,male,1,True,,94,"Dean, Mr. Bertram Frank",True,False -False,True,59.0,male,0,True,,95,"Coxon, Mr. Daniel",False,False -False,True,,male,0,True,,96,"Shorney, Mr. Charles Joseph",False,False -False,True,71.0,male,0,True,A5,97,"Goldschmidt, Mr. George B",False,False -True,True,23.0,male,0,True,D10 D12,98,"Greenfield, Mr. William Bertram",False,False -True,True,34.0,female,0,False,,99,"Doling, Mrs. John T (Ada Julia Bone)",False,True -False,True,34.0,male,1,True,,100,"Kantor, Mr. Sinai",True,False +has_siblings,name,sex,cabin,sibsp,age,is_female,is_male,is_mr,survived,passenger_id +False,"Waelens, Mr. Achille",male,,0,22.0,False,True,True,False,81 +False,"Sheerlinck, Mr. Jan Baptist",male,,0,29.0,False,True,True,True,82 +False,"McDermott, Miss. Brigdet Delia",female,,0,,True,False,False,True,83 +False,"Carrau, Mr. Francisco M",male,,0,28.0,False,True,True,False,84 +False,"Ilett, Miss. Bertha",female,,0,17.0,True,False,False,True,85 +True,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,,3,33.0,True,False,True,True,86 +True,"Ford, Mr. William Neal",male,,1,16.0,False,True,True,False,87 +False,"Slocovski, Mr. Selman Francis",male,,0,,False,True,True,False,88 +True,"Fortune, Miss. Mabel Helen",female,C23 C25 C27,3,23.0,True,False,False,True,89 +False,"Celotti, Mr. Francesco",male,,0,24.0,False,True,True,False,90 +False,"Christmann, Mr. Emil",male,,0,29.0,False,True,True,False,91 +False,"Andreasson, Mr. Paul Edvin",male,,0,20.0,False,True,True,False,92 +True,"Chaffee, Mr. Herbert Fuller",male,E31,1,46.0,False,True,True,False,93 +True,"Dean, Mr. Bertram Frank",male,,1,26.0,False,True,True,False,94 +False,"Coxon, Mr. Daniel",male,,0,59.0,False,True,True,False,95 +False,"Shorney, Mr. Charles Joseph",male,,0,,False,True,True,False,96 +False,"Goldschmidt, Mr. George B",male,A5,0,71.0,False,True,True,False,97 +False,"Greenfield, Mr. William Bertram",male,D10 D12,0,23.0,False,True,True,True,98 +False,"Doling, Mrs. John T (Ada Julia Bone)",female,,0,34.0,True,False,True,True,99 +True,"Kantor, Mr. Sinai",male,,1,34.0,False,True,True,False,100