From 54457089df5abb16b5a414af033ab90d8e698694 Mon Sep 17 00:00:00 2001 From: "Mats E. Mollestad" Date: Sat, 3 Feb 2024 21:37:24 +0100 Subject: [PATCH] Improved datetime handeling --- aligned/data_source/batch_data_source.py | 5 + aligned/local/job.py | 35 ++++++- aligned/retrival_job.py | 45 +++++++-- aligned/schemas/date_formatter.py | 101 +++++++++++++++++++ aligned/sources/local.py | 14 ++- pyproject.toml | 2 +- test_data/credit_history_mater.parquet | Bin 972 -> 978 bytes test_data/feature-store.json | 2 +- test_data/test_model.parquet | Bin 586 -> 586 bytes test_data/titanic-sets.json | 2 +- test_data/titanic-test.csv | 42 ++++---- test_data/titanic-train.csv | 122 +++++++++++------------ test_data/titanic-validate.csv | 42 ++++---- 13 files changed, 293 insertions(+), 119 deletions(-) create mode 100644 aligned/schemas/date_formatter.py diff --git a/aligned/data_source/batch_data_source.py b/aligned/data_source/batch_data_source.py index 84a1866..d33c53d 100644 --- a/aligned/data_source/batch_data_source.py +++ b/aligned/data_source/batch_data_source.py @@ -11,6 +11,7 @@ from aligned.schemas.feature import EventTimestamp, Feature, FeatureLocation from aligned.request.retrival_request import RequestResult, RetrivalRequest from aligned.compiler.feature_factory import FeatureFactory +from polars.type_aliases import TimeUnit if TYPE_CHECKING: from aligned.retrival_job import RetrivalJob @@ -505,6 +506,8 @@ class JoinAsofDataSource(BatchDataSource): left_on: list[str] | None = None right_on: list[str] | None = None + timestamp_unit: TimeUnit = 'us' + type_name: str = 'join_asof' def job_group_key(self) -> str: @@ -525,6 +528,7 @@ def all_with_limit(self, limit: int | None) -> RetrivalJob: right_event_timestamp=self.right_event_timestamp, left_on=self.left_on, right_on=self.right_on, + timestamp_unit=self.timestamp_unit, ) ) @@ -543,6 +547,7 @@ def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: right_event_timestamp=self.right_event_timestamp, left_on=self.left_on, right_on=self.right_on, + timestamp_unit=self.timestamp_unit, ) .aggregate(request) .derive_features([request]) diff --git a/aligned/local/job.py b/aligned/local/job.py index 8304c5b..284ab74 100644 --- a/aligned/local/job.py +++ b/aligned/local/job.py @@ -7,7 +7,8 @@ from aligned.request.retrival_request import AggregatedFeature, AggregateOver, RetrivalRequest from aligned.retrival_job import RequestResult, RetrivalJob -from aligned.schemas.feature import Feature +from aligned.schemas.date_formatter import DateFormatter +from aligned.schemas.feature import Feature, FeatureType from aligned.sources.local import DataFileReference @@ -120,12 +121,39 @@ async def aggregate(request: RetrivalRequest, core_data: pl.LazyFrame) -> pl.Laz return results +def decode_timestamps(df: pl.LazyFrame, request: RetrivalRequest, formatter: DateFormatter) -> pl.LazyFrame: + + columns: set[str] = set() + dtypes = dict(zip(df.columns, df.dtypes)) + + for feature in request.all_features: + if ( + feature.dtype == FeatureType.datetime + and feature.name in df.columns + and not isinstance(dtypes[feature.name], pl.Datetime) + ): + columns.add(feature.name) + + if ( + request.event_timestamp + and request.event_timestamp.name in df.columns + and not isinstance(dtypes[request.event_timestamp.name], pl.Datetime) + ): + columns.add(request.event_timestamp.name) + + if not columns: + return df + + return df.with_columns([formatter.decode_polars(column).alias(column) for column in columns]) + + @dataclass class FileFullJob(RetrivalJob): source: DataFileReference request: RetrivalRequest limit: int | None = field(default=None) + date_formatter: DateFormatter = field(default=DateFormatter.iso_8601()) @property def request_result(self) -> RequestResult: @@ -178,6 +206,7 @@ async def file_transform_polars(self, df: pl.LazyFrame) -> pl.LazyFrame: if org_name != wanted_name } df = df.rename(mapping=renames) + df = decode_timestamps(df, self.request, self.date_formatter) if self.request.aggregated_features: df = await aggregate(self.request, df) @@ -202,6 +231,7 @@ class FileDateJob(RetrivalJob): request: RetrivalRequest start_date: datetime end_date: datetime + date_formatter: DateFormatter = field(default=DateFormatter.iso_8601()) @property def request_result(self) -> RequestResult: @@ -250,6 +280,7 @@ def file_transform_polars(self, df: pl.LazyFrame) -> pl.LazyFrame: df = df.rename(mapping=dict(zip(request_features, all_names))) event_timestamp_column = self.request.event_timestamp.name + df = decode_timestamps(df, self.request, self.date_formatter) return df.filter(pl.col(event_timestamp_column).is_between(self.start_date, self.end_date)) @@ -302,6 +333,7 @@ class FileFactualJob(RetrivalJob): source: DataFileReference | RetrivalJob requests: list[RetrivalRequest] facts: RetrivalJob + date_formatter: DateFormatter = field(default=DateFormatter.iso_8601()) @property def request_result(self) -> RequestResult: @@ -387,6 +419,7 @@ async def file_transformations(self, df: pl.LazyFrame) -> pl.LazyFrame: if isinstance(self.source, ColumnFeatureMappable): request_features = self.source.feature_identifier_for(all_names) + df = decode_timestamps(df, request, self.date_formatter) feature_df = df.select(request_features) renames = { diff --git a/aligned/retrival_job.py b/aligned/retrival_job.py index c6d9c1b..9a17ad0 100644 --- a/aligned/retrival_job.py +++ b/aligned/retrival_job.py @@ -1,4 +1,5 @@ from __future__ import annotations +from aligned.schemas.date_formatter import DateFormatter import asyncio import logging @@ -12,6 +13,7 @@ import pandas as pd import polars as pl +from polars.type_aliases import TimeUnit from prometheus_client import Histogram from aligned.exceptions import UnableToFindFileException @@ -493,6 +495,7 @@ def join_asof( right_event_timestamp: str | None = None, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, + timestamp_unit: TimeUnit = 'us', ) -> RetrivalJob: if isinstance(left_on, str): @@ -518,6 +521,7 @@ def join_asof( right_event_timestamp=right_event_timestamp, left_on=left_on, right_on=right_on, + timestamp_unit=timestamp_unit, ) def join( @@ -610,11 +614,15 @@ def derive_features(self, requests: list[RetrivalRequest] | None = None) -> Retr def combined_features(self, requests: list[RetrivalRequest] | None = None) -> RetrivalJob: return CombineFactualJob([self], requests or self.retrival_requests) - def ensure_types(self, requests: list[RetrivalRequest] | None = None) -> RetrivalJob: + def ensure_types( + self, requests: list[RetrivalRequest] | None = None, date_formatter: DateFormatter | None = None + ) -> RetrivalJob: if not requests: requests = self.retrival_requests - return EnsureTypesJob(job=self, requests=requests) + return EnsureTypesJob( + job=self, requests=requests, date_formatter=date_formatter or DateFormatter.iso_8601() + ) def select_columns(self, include_features: set[str]) -> RetrivalJob: return SelectColumnsJob(include_features, self) @@ -800,6 +808,21 @@ def describe(self) -> str: return f'OnLoadJob {self.on_load} -> {self.job.describe()}' +@dataclass +class EncodeDatesJob(RetrivalJob, ModificationJob): + + job: RetrivalJob + formatter: DateFormatter + columns: list[str] + + async def to_polars(self) -> pl.LazyFrame: + data = await self.job.to_polars() + return data.with_columns([self.formatter.encode_polars(column) for column in self.columns]) + + async def to_pandas(self) -> pd.DataFrame: + return (await self.to_polars()).collect().to_pandas() + + @dataclass class InMemoryCacheJob(RetrivalJob, ModificationJob): @@ -872,6 +895,8 @@ class JoinAsofJob(RetrivalJob): left_on: list[str] | None right_on: list[str] | None + timestamp_unit: TimeUnit = field(default='us') + @property def request_result(self) -> RequestResult: return RequestResult.from_result_list([self.left_job.request_result, self.right_job.request_result]) @@ -884,8 +909,10 @@ async def to_polars(self) -> pl.LazyFrame: left = await self.left_job.to_polars() right = await self.right_job.to_polars() - return left.join_asof( - right, + return left.with_columns( + pl.col(self.left_event_timestamp).dt.cast_time_unit(self.timestamp_unit), + ).join_asof( + right.with_columns(pl.col(self.right_event_timestamp).dt.cast_time_unit(self.timestamp_unit)), by_left=self.left_on, by_right=self.right_on, left_on=self.left_event_timestamp, @@ -1698,6 +1725,7 @@ class EnsureTypesJob(RetrivalJob, ModificationJob): job: RetrivalJob requests: list[RetrivalRequest] + date_formatter: DateFormatter = field(default_factory=DateFormatter.iso_8601) @property def request_result(self) -> RequestResult: @@ -1771,14 +1799,11 @@ async def to_polars(self) -> pl.LazyFrame: df = df.with_columns(pl.col(feature.name).cast(pl.Int8).cast(pl.Boolean)) elif feature.dtype == FeatureType.datetime(): current_dtype = df.select([feature.name]).dtypes[0] + if isinstance(current_dtype, pl.Datetime): continue - # Convert from ms to us - df = df.with_columns( - (pl.col(feature.name).cast(pl.Int64) * 1000) - .cast(pl.Datetime(time_zone='UTC')) - .alias(feature.name) - ) + + df = df.with_columns(self.date_formatter.decode_polars(feature.name)) elif (feature.dtype == FeatureType.array()) or (feature.dtype == FeatureType.embedding()): dtype = df.select(feature.name).dtypes[0] if dtype == pl.Utf8: diff --git a/aligned/schemas/date_formatter.py b/aligned/schemas/date_formatter.py new file mode 100644 index 0000000..0f029e7 --- /dev/null +++ b/aligned/schemas/date_formatter.py @@ -0,0 +1,101 @@ +from __future__ import annotations +from dataclasses import dataclass, field +import polars as pl +from polars.type_aliases import TimeUnit +from aligned.schemas.codable import Codable +from mashumaro.types import SerializableType + + +@dataclass +class AllDateFormatters: + + supported_formatters: dict[str, type[DateFormatter]] + + _shared: AllDateFormatters | None = None + + @classmethod + def shared(cls) -> AllDateFormatters: + if cls._shared is None: + formatters = [ + Timestamp, + StringDateFormatter, + ] + cls._shared = AllDateFormatters({formatter.name(): formatter for formatter in formatters}) + return cls._shared + + +class DateFormatter(Codable, SerializableType): + @classmethod + def name(cls) -> str: + raise NotImplementedError(cls) + + def decode_polars(self, column: str) -> pl.Expr: + raise NotImplementedError(type(self)) + + def encode_polars(self, column: str) -> pl.Expr: + raise NotImplementedError(type(self)) + + def _serialize(self) -> dict: + assert type(self).name() in AllDateFormatters.shared().supported_formatters + data = self.to_dict() + data['name'] = type(self).name() + return data + + @classmethod + def _deserialize(cls, data: dict) -> DateFormatter: + formatter_name = data.pop('name') + formatters = AllDateFormatters.shared().supported_formatters + if formatter_name not in formatters: + raise ValueError( + f"Unknown formatter name: {formatter_name}. Supported formatters: {formatters.keys()}" + ) + formatter_class = formatters[formatter_name] + return formatter_class.from_dict(data) + + @staticmethod + def string_format(format: str) -> StringDateFormatter: + return StringDateFormatter(format) + + @staticmethod + def iso_8601() -> StringDateFormatter: + return StringDateFormatter('yyyy-MM-ddTHH:mm:ssZ') + + @staticmethod + def unix_timestamp(time_unit: TimeUnit = 'us') -> Timestamp: + return Timestamp(time_unit) + + +@dataclass +class Timestamp(DateFormatter): + + time_unit: TimeUnit = field(default='us') + + @classmethod + def name(cls) -> str: + return 'timestamp' + + def decode_polars(self, column: str) -> pl.Expr: + return pl.from_epoch(column, self.time_unit) + + def encode_polars(self, column: str) -> pl.Expr: + return pl.col(column).dt.timestamp(self.time_unit) + + +@dataclass +class StringDateFormatter(DateFormatter): + + date_format: str + time_unit: TimeUnit | None = field(default=None) + time_zone: str | None = field(default=None) + + @classmethod + def name(cls) -> str: + return 'string_form' + + def decode_polars(self, column: str) -> pl.Expr: + return pl.col(column).str.to_datetime( + self.date_format, time_unit=self.time_unit, time_zone=self.time_zone + ) + + def encode_polars(self, column: str) -> pl.Expr: + return pl.col(column).dt.strftime(self.date_format) diff --git a/aligned/sources/local.py b/aligned/sources/local.py index 43996ad..d3545f7 100644 --- a/aligned/sources/local.py +++ b/aligned/sources/local.py @@ -24,6 +24,7 @@ from aligned.storage import Storage from aligned.feature_store import FeatureStore from aligned.feature_source import WritableFeatureSource +from aligned.schemas.date_formatter import DateFormatter if TYPE_CHECKING: from aligned.compiler.feature_factory import FeatureFactory @@ -100,6 +101,7 @@ class CsvFileSource(BatchDataSource, ColumnFeatureMappable, StatisticEricher, Da path: str mapping_keys: dict[str, str] = field(default_factory=dict) csv_config: CsvConfig = field(default_factory=CsvConfig) + formatter: DateFormatter = field(default_factory=DateFormatter.iso_8601) type_name: str = 'csv' @@ -489,9 +491,17 @@ def json_at(path: str) -> StorageFileSource: @staticmethod def csv_at( - path: str, mapping_keys: dict[str, str] | None = None, csv_config: CsvConfig | None = None + path: str, + mapping_keys: dict[str, str] | None = None, + csv_config: CsvConfig | None = None, + date_formatter: DateFormatter | None = None, ) -> CsvFileSource: - return CsvFileSource(path, mapping_keys=mapping_keys or {}, csv_config=csv_config or CsvConfig()) + return CsvFileSource( + path, + mapping_keys=mapping_keys or {}, + csv_config=csv_config or CsvConfig(), + formatter=date_formatter or DateFormatter.iso_8601(), + ) @staticmethod def parquet_at( diff --git a/pyproject.toml b/pyproject.toml index f1ecf5c..dda7fef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "aligned" -version = "0.0.66" +version = "0.0.67" description = "A data managment and lineage tool for ML applications." authors = ["Mats E. Mollestad "] license = "Apache-2.0" diff --git a/test_data/credit_history_mater.parquet b/test_data/credit_history_mater.parquet index 3ff9af6471ccb920bbeea5fc6d40c235956b8a9a..02e451b995f12fa60d0541ca35beb91100a35fbb 100644 GIT binary patch delta 571 zcmaJ*&rcIU7@Zk5+XXbF2{Y`Ho(!FMAYp&d?$&6s(_mYO7P^`)B_^d)EE`yCLNyXE z@CTG&vlqR2<7{G3!_AWi;=zOGdiPJz#u?z^#V?unz3+W9@6AQ(d+PPA^rv$lrsj{v zFMJ^D^!=buH4R`D@8HeB`7HJF=Az{^?VRI|r*G#R)3yrDoaGj7rsr(av+_;Pb=+hG zb+-@c8I(^KasG*Dk^-z?kOp8ZrDtTo@bl1z+Pt~_)0ZPS_~<8ZVbj}n| zB^0H|iX{Q}8G9t{L=PCd)E~+vV@sZlE?!<~i@CVV?Uhd3slK@Odnv^3KZZ<5|1r!< zLy=KttigowV;R$MR^p9vY_8kYsJ$0-HY)Xa-Kko<#_j=KUF*iJ{T?_$0IadV`U_s@ Bjky2- delta 558 zcmcb_eujO5l3>Ml2S#?C2Rk1)FaUu;`ooFu)l3r9I2c6PL`_6ZVt^Eg$so!i${@(V zz|1bez`)4Jv3w&)8^ey>U~LVYXB;NaW>n?e#RyWIB^oy|QA*X+$iTwTEZ)N0*ql>P z!NSbI#Lzh2$W%~J!_?To$j~z0)X>~=;z9}kE=ILiObn7TVjQ9gVn9~_S-TkF%vVgJ zU5p@MQ5H!b50;|*{1Q<%22n0mCJE8vlG2payps5w{KUNYl+x7r+|RD?lo7SrTejB30(bb~R(4Pv)gL?sx+ zt}%(Rsd0#%ViEy4MuNvND9AtDsyI0#H8)Y>2a`SsRCrGIW^&O8bp(PS5aAdI#4g1k zIUf+$8ALcng2|x3$=jJ^rSt=UlI4MpPL7TqAn_=7%ghL4zpAA4$@7@p>Rro$V(xyG zQQ<)aDQ=cokr7_m5k^6VQDK%H?&IW_p5*1=6lin?Z&F zK|m2ulLgQ*k&aI3j;@YQAQwATrFev8M}?ajL>N_+MTVQEMR|lt=R_D8=0t{@I5J4E U1?1->78NseGchm(I0hL40Et_R_W%F@ diff --git a/test_data/feature-store.json b/test_data/feature-store.json index b138079..b05e14b 100644 --- a/test_data/feature-store.json +++ b/test_data/feature-store.json @@ -1 +1 @@ -{"metadata": {"created_at": "2024-01-28T13:07:54.895851", "name": "feature_store_location.py", "repo_url": null, "github_url": null}, "feature_views": [{"name": "titanic", "tags": {}, "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "updated_at", "dtype": {"name": "datetime"}, "description": null, "tags": null, "constraints": null}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "derived_features": [{"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}, {"name": "titanic_parquet", "tags": {}, "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "derived_features": [{"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "event_triggers": null, "contacts": null, "indexes": []}], "combined_feature_views": [], "models": [{"name": "titanic", "features": {"default_version": "default", "versions": {"default": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}]}}, "predictions_view": {"entities": [], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}], "recommendation_targets": []}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_store": null, "exposed_at_url": null}], "enrichers": []} +{"metadata": {"created_at": "2024-02-03T20:34:56.568052", "name": "feature_store_location.py", "repo_url": null, "github_url": null}, "feature_views": [{"name": "titanic_parquet", "tags": {}, "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "event_triggers": null, "contacts": null, "indexes": []}, {"name": "titanic", "tags": {}, "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "updated_at", "dtype": {"name": "datetime"}, "description": null, "tags": null, "constraints": null}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}], "combined_feature_views": [], "models": [{"name": "titanic", "features": {"default_version": "default", "versions": {"default": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}]}}, "predictions_view": {"entities": [], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}], "recommendation_targets": []}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_store": null, "exposed_at_url": null}], "enrichers": []} diff --git a/test_data/test_model.parquet b/test_data/test_model.parquet index db5f293167fbddbcb3cc3f1fbdaf1ababe276bc5..78c31e37e224318483cea6540eca344cf8e7a53b 100644 GIT binary patch delta 37 ocmX@ba*Abw95){WCnEzFC%XfK$V5#$ZYB_y5yWNQSagFC0C_J4I{*Lx delta 37 ocmX@ba*Abw9JdGqCnEzFC%Xd!-$YG2Ze|dd5yWNMSagFC0C{x=I{*Lx diff --git a/test_data/titanic-sets.json b/test_data/titanic-sets.json index 69f587b..32c55a9 100644 --- a/test_data/titanic-sets.json +++ b/test_data/titanic-sets.json @@ -1 +1 @@ -{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}, {"name": "in_domain", "values": ["male", "female"]}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "optional"}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []} +{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}, {"name": "optional"}]}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}, {"name": "optional"}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []} diff --git a/test_data/titanic-test.csv b/test_data/titanic-test.csv index 809d48f..741ec4d 100644 --- a/test_data/titanic-test.csv +++ b/test_data/titanic-test.csv @@ -1,21 +1,21 @@ -passenger_id,cabin,sex,sibsp,is_mr,has_siblings,is_male,name,is_female,age,survived -61,,male,0,True,False,True,"Sirayanian, Mr. Orsen",False,22.0,False -62,B28,female,0,False,False,False,"Icard, Miss. Amelie",True,38.0,True -63,C83,male,1,True,True,True,"Harris, Mr. Henry Birkhardt",False,45.0,False -64,,male,3,False,True,True,"Skoog, Master. Harald",False,4.0,False -65,,male,0,True,False,True,"Stewart, Mr. Albert A",False,,False -66,,male,1,False,True,True,"Moubarek, Master. Gerios",False,,True -67,F33,female,0,True,False,False,"Nye, Mrs. (Elizabeth Ramell)",True,29.0,True -68,,male,0,True,False,True,"Crease, Mr. Ernest James",False,19.0,False -69,,female,4,False,True,False,"Andersson, Miss. Erna Alexandra",True,17.0,True -70,,male,2,True,True,True,"Kink, Mr. Vincenz",False,26.0,False -71,,male,0,True,False,True,"Jenkin, Mr. Stephen Curnow",False,32.0,False -72,,female,5,False,True,False,"Goodwin, Miss. Lillian Amy",True,16.0,False -73,,male,0,True,False,True,"Hood, Mr. Ambrose Jr",False,21.0,False -74,,male,1,True,True,True,"Chronopoulos, Mr. Apostolos",False,26.0,False -75,,male,0,True,False,True,"Bing, Mr. Lee",False,32.0,True -76,F G73,male,0,True,False,True,"Moen, Mr. Sigurd Hansen",False,25.0,False -77,,male,0,True,False,True,"Staneff, Mr. Ivan",False,,False -78,,male,0,True,False,True,"Moutal, Mr. Rahamin Haim",False,,False -79,,male,0,False,False,True,"Caldwell, Master. Alden Gates",False,0.83,True -80,,female,0,False,False,False,"Dowdell, Miss. Elizabeth",True,30.0,True +is_mr,passenger_id,age,cabin,is_female,sibsp,is_male,survived,has_siblings,sex,name +True,61,22.0,,False,0,True,False,False,male,"Sirayanian, Mr. Orsen" +False,62,38.0,B28,True,0,False,True,False,female,"Icard, Miss. Amelie" +True,63,45.0,C83,False,1,True,False,True,male,"Harris, Mr. Henry Birkhardt" +False,64,4.0,,False,3,True,False,True,male,"Skoog, Master. Harald" +True,65,,,False,0,True,False,False,male,"Stewart, Mr. Albert A" +False,66,,,False,1,True,True,True,male,"Moubarek, Master. Gerios" +True,67,29.0,F33,True,0,False,True,False,female,"Nye, Mrs. (Elizabeth Ramell)" +True,68,19.0,,False,0,True,False,False,male,"Crease, Mr. Ernest James" +False,69,17.0,,True,4,False,True,True,female,"Andersson, Miss. Erna Alexandra" +True,70,26.0,,False,2,True,False,True,male,"Kink, Mr. Vincenz" +True,71,32.0,,False,0,True,False,False,male,"Jenkin, Mr. Stephen Curnow" +False,72,16.0,,True,5,False,False,True,female,"Goodwin, Miss. Lillian Amy" +True,73,21.0,,False,0,True,False,False,male,"Hood, Mr. Ambrose Jr" +True,74,26.0,,False,1,True,False,True,male,"Chronopoulos, Mr. Apostolos" +True,75,32.0,,False,0,True,True,False,male,"Bing, Mr. Lee" +True,76,25.0,F G73,False,0,True,False,False,male,"Moen, Mr. Sigurd Hansen" +True,77,,,False,0,True,False,False,male,"Staneff, Mr. Ivan" +True,78,,,False,0,True,False,False,male,"Moutal, Mr. Rahamin Haim" +False,79,0.83,,False,0,True,True,False,male,"Caldwell, Master. Alden Gates" +False,80,30.0,,True,0,False,True,False,female,"Dowdell, Miss. Elizabeth" diff --git a/test_data/titanic-train.csv b/test_data/titanic-train.csv index 44ddec1..925bdb7 100644 --- a/test_data/titanic-train.csv +++ b/test_data/titanic-train.csv @@ -1,61 +1,61 @@ -passenger_id,cabin,sex,sibsp,is_mr,has_siblings,is_male,name,is_female,age,survived -1,,male,1,True,True,True,"Braund, Mr. Owen Harris",False,22.0,False -2,C85,female,1,True,True,False,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",True,38.0,True -3,,female,0,False,False,False,"Heikkinen, Miss. Laina",True,26.0,True -4,C123,female,1,True,True,False,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",True,35.0,True -5,,male,0,True,False,True,"Allen, Mr. William Henry",False,35.0,False -6,,male,0,True,False,True,"Moran, Mr. James",False,,False -7,E46,other,0,True,False,False,"McCarthy, Mr. Timothy J",False,54.0,False -8,,male,3,False,True,True,"Palsson, Master. Gosta Leonard",False,2.0,False -9,,female,0,True,False,False,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",True,27.0,True -10,,female,1,True,True,False,"Nasser, Mrs. Nicholas (Adele Achem)",True,14.0,True -11,G6,female,1,False,True,False,"Sandstrom, Miss. Marguerite Rut",True,4.0,True -12,C103,female,0,False,False,False,"Bonnell, Miss. Elizabeth",True,58.0,True -13,,male,0,True,False,True,"Saundercock, Mr. William Henry",False,20.0,False -14,,male,1,True,True,True,"Andersson, Mr. Anders Johan",False,39.0,False -15,,female,0,False,False,False,"Vestrom, Miss. Hulda Amanda Adolfina",True,14.0,False -16,,female,0,True,False,False,"Hewlett, Mrs. (Mary D Kingcome) ",True,55.0,True -17,,male,4,False,True,True,"Rice, Master. Eugene",False,2.0,False -18,,male,0,True,False,True,"Williams, Mr. Charles Eugene",False,,True -19,,female,1,True,True,False,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",True,31.0,False -20,,female,0,True,False,False,"Masselmani, Mrs. Fatima",True,,True -21,,male,0,True,False,True,"Fynney, Mr. Joseph J",False,35.0,False -22,D56,male,0,True,False,True,"Beesley, Mr. Lawrence",False,34.0,True -23,,female,0,False,False,False,"McGowan, Miss. Anna ""Annie""",True,15.0,True -24,A6,male,0,True,False,True,"Sloper, Mr. William Thompson",False,28.0,True -25,,female,3,False,True,False,"Palsson, Miss. Torborg Danira",True,8.0,False -26,,female,1,True,True,False,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",True,38.0,True -27,,male,0,True,False,True,"Emir, Mr. Farred Chehab",False,,False -28,C23 C25 C27,male,3,True,True,True,"Fortune, Mr. Charles Alexander",False,19.0,False -29,,female,0,False,False,False,"O'Dwyer, Miss. Ellen ""Nellie""",True,,True -30,,male,0,True,False,True,"Todoroff, Mr. Lalio",False,,False -31,,male,0,False,False,True,"Uruchurtu, Don. Manuel E",False,40.0,False -32,B78,female,1,True,True,False,"Spencer, Mrs. William Augustus (Marie Eugenie)",True,,True -33,,female,0,False,False,False,"Glynn, Miss. Mary Agatha",True,,True -34,,male,0,True,False,True,"Wheadon, Mr. Edward H",False,66.0,False -35,,male,1,True,True,True,"Meyer, Mr. Edgar Joseph",False,28.0,False -36,,male,1,True,True,True,"Holverson, Mr. Alexander Oskar",False,42.0,False -37,,male,0,True,False,True,"Mamee, Mr. Hanna",False,,True -38,,male,0,True,False,True,"Cann, Mr. Ernest Charles",False,21.0,False -39,,female,2,False,True,False,"Vander Planke, Miss. Augusta Maria",True,18.0,False -40,,female,1,False,True,False,"Nicola-Yarred, Miss. Jamila",True,14.0,True -41,,female,1,True,True,False,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",True,40.0,False -42,,female,1,True,True,False,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",True,27.0,False -43,,male,0,True,False,True,"Kraeff, Mr. Theodor",False,,False -44,,female,1,False,True,False,"Laroche, Miss. Simonne Marie Anne Andree",True,3.0,True -45,,female,0,False,False,False,"Devaney, Miss. Margaret Delia",True,19.0,True -46,,male,0,True,False,True,"Rogers, Mr. William John",False,,False -47,,male,1,True,True,True,"Lennon, Mr. Denis",False,,False -48,,female,0,False,False,False,"O'Driscoll, Miss. Bridget",True,,True -49,,male,2,True,True,True,"Samaan, Mr. Youssef",False,,False -50,,female,1,True,True,False,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",True,18.0,False -51,,male,4,False,True,True,"Panula, Master. Juha Niilo",False,7.0,False -52,,male,0,True,False,True,"Nosworthy, Mr. Richard Cater",False,21.0,False -53,D33,female,1,True,True,False,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",True,49.0,True -54,,female,1,True,True,False,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",True,29.0,True -55,B30,male,0,True,False,True,"Ostby, Mr. Engelhart Cornelius",False,65.0,False -56,C52,male,0,True,False,True,"Woolner, Mr. Hugh",False,,True -57,,female,0,False,False,False,"Rugg, Miss. Emily",True,21.0,True -58,,male,0,True,False,True,"Novel, Mr. Mansouer",False,28.5,False -59,,female,1,False,True,False,"West, Miss. Constance Mirium",True,5.0,True -60,,male,5,False,True,True,"Goodwin, Master. William Frederick",False,11.0,False +is_mr,passenger_id,age,cabin,is_female,sibsp,is_male,survived,has_siblings,sex,name +True,1,22.0,,False,1,True,False,True,male,"Braund, Mr. Owen Harris" +True,2,38.0,C85,True,1,False,True,True,female,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)" +False,3,26.0,,True,0,False,True,False,female,"Heikkinen, Miss. Laina" +True,4,35.0,C123,True,1,False,True,True,female,"Futrelle, Mrs. Jacques Heath (Lily May Peel)" +True,5,35.0,,False,0,True,False,False,male,"Allen, Mr. William Henry" +True,6,,,False,0,True,False,False,male,"Moran, Mr. James" +True,7,54.0,E46,False,0,False,False,False,other,"McCarthy, Mr. Timothy J" +False,8,2.0,,False,3,True,False,True,male,"Palsson, Master. Gosta Leonard" +True,9,27.0,,True,0,False,True,False,female,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)" +True,10,14.0,,True,1,False,True,True,female,"Nasser, Mrs. Nicholas (Adele Achem)" +False,11,4.0,G6,True,1,False,True,True,female,"Sandstrom, Miss. Marguerite Rut" +False,12,58.0,C103,True,0,False,True,False,female,"Bonnell, Miss. Elizabeth" +True,13,20.0,,False,0,True,False,False,male,"Saundercock, Mr. William Henry" +True,14,39.0,,False,1,True,False,True,male,"Andersson, Mr. Anders Johan" +False,15,14.0,,True,0,False,False,False,female,"Vestrom, Miss. Hulda Amanda Adolfina" +True,16,55.0,,True,0,False,True,False,female,"Hewlett, Mrs. (Mary D Kingcome) " +False,17,2.0,,False,4,True,False,True,male,"Rice, Master. Eugene" +True,18,,,False,0,True,True,False,male,"Williams, Mr. Charles Eugene" +True,19,31.0,,True,1,False,False,True,female,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)" +True,20,,,True,0,False,True,False,female,"Masselmani, Mrs. Fatima" +True,21,35.0,,False,0,True,False,False,male,"Fynney, Mr. Joseph J" +True,22,34.0,D56,False,0,True,True,False,male,"Beesley, Mr. Lawrence" +False,23,15.0,,True,0,False,True,False,female,"McGowan, Miss. Anna ""Annie""" +True,24,28.0,A6,False,0,True,True,False,male,"Sloper, Mr. William Thompson" +False,25,8.0,,True,3,False,False,True,female,"Palsson, Miss. Torborg Danira" +True,26,38.0,,True,1,False,True,True,female,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)" +True,27,,,False,0,True,False,False,male,"Emir, Mr. Farred Chehab" +True,28,19.0,C23 C25 C27,False,3,True,False,True,male,"Fortune, Mr. Charles Alexander" +False,29,,,True,0,False,True,False,female,"O'Dwyer, Miss. Ellen ""Nellie""" +True,30,,,False,0,True,False,False,male,"Todoroff, Mr. Lalio" +False,31,40.0,,False,0,True,False,False,male,"Uruchurtu, Don. Manuel E" +True,32,,B78,True,1,False,True,True,female,"Spencer, Mrs. William Augustus (Marie Eugenie)" +False,33,,,True,0,False,True,False,female,"Glynn, Miss. Mary Agatha" +True,34,66.0,,False,0,True,False,False,male,"Wheadon, Mr. Edward H" +True,35,28.0,,False,1,True,False,True,male,"Meyer, Mr. Edgar Joseph" +True,36,42.0,,False,1,True,False,True,male,"Holverson, Mr. Alexander Oskar" +True,37,,,False,0,True,True,False,male,"Mamee, Mr. Hanna" +True,38,21.0,,False,0,True,False,False,male,"Cann, Mr. Ernest Charles" +False,39,18.0,,True,2,False,False,True,female,"Vander Planke, Miss. Augusta Maria" +False,40,14.0,,True,1,False,True,True,female,"Nicola-Yarred, Miss. Jamila" +True,41,40.0,,True,1,False,False,True,female,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)" +True,42,27.0,,True,1,False,False,True,female,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)" +True,43,,,False,0,True,False,False,male,"Kraeff, Mr. Theodor" +False,44,3.0,,True,1,False,True,True,female,"Laroche, Miss. Simonne Marie Anne Andree" +False,45,19.0,,True,0,False,True,False,female,"Devaney, Miss. Margaret Delia" +True,46,,,False,0,True,False,False,male,"Rogers, Mr. William John" +True,47,,,False,1,True,False,True,male,"Lennon, Mr. Denis" +False,48,,,True,0,False,True,False,female,"O'Driscoll, Miss. Bridget" +True,49,,,False,2,True,False,True,male,"Samaan, Mr. Youssef" +True,50,18.0,,True,1,False,False,True,female,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)" +False,51,7.0,,False,4,True,False,True,male,"Panula, Master. Juha Niilo" +True,52,21.0,,False,0,True,False,False,male,"Nosworthy, Mr. Richard Cater" +True,53,49.0,D33,True,1,False,True,True,female,"Harper, Mrs. Henry Sleeper (Myna Haxtun)" +True,54,29.0,,True,1,False,True,True,female,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)" +True,55,65.0,B30,False,0,True,False,False,male,"Ostby, Mr. Engelhart Cornelius" +True,56,,C52,False,0,True,True,False,male,"Woolner, Mr. Hugh" +False,57,21.0,,True,0,False,True,False,female,"Rugg, Miss. Emily" +True,58,28.5,,False,0,True,False,False,male,"Novel, Mr. Mansouer" +False,59,5.0,,True,1,False,True,True,female,"West, Miss. Constance Mirium" +False,60,11.0,,False,5,True,False,True,male,"Goodwin, Master. William Frederick" diff --git a/test_data/titanic-validate.csv b/test_data/titanic-validate.csv index 451e0d3..eea2faf 100644 --- a/test_data/titanic-validate.csv +++ b/test_data/titanic-validate.csv @@ -1,21 +1,21 @@ -passenger_id,cabin,sex,sibsp,is_mr,has_siblings,is_male,name,is_female,age,survived -81,,male,0,True,False,True,"Waelens, Mr. Achille",False,22.0,False -82,,male,0,True,False,True,"Sheerlinck, Mr. Jan Baptist",False,29.0,True -83,,female,0,False,False,False,"McDermott, Miss. Brigdet Delia",True,,True -84,,male,0,True,False,True,"Carrau, Mr. Francisco M",False,28.0,False -85,,female,0,False,False,False,"Ilett, Miss. Bertha",True,17.0,True -86,,female,3,True,True,False,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",True,33.0,True -87,,male,1,True,True,True,"Ford, Mr. William Neal",False,16.0,False -88,,male,0,True,False,True,"Slocovski, Mr. Selman Francis",False,,False -89,C23 C25 C27,female,3,False,True,False,"Fortune, Miss. Mabel Helen",True,23.0,True -90,,male,0,True,False,True,"Celotti, Mr. Francesco",False,24.0,False -91,,male,0,True,False,True,"Christmann, Mr. Emil",False,29.0,False -92,,male,0,True,False,True,"Andreasson, Mr. Paul Edvin",False,20.0,False -93,E31,male,1,True,True,True,"Chaffee, Mr. Herbert Fuller",False,46.0,False -94,,male,1,True,True,True,"Dean, Mr. Bertram Frank",False,26.0,False -95,,male,0,True,False,True,"Coxon, Mr. Daniel",False,59.0,False -96,,male,0,True,False,True,"Shorney, Mr. Charles Joseph",False,,False -97,A5,male,0,True,False,True,"Goldschmidt, Mr. George B",False,71.0,False -98,D10 D12,male,0,True,False,True,"Greenfield, Mr. William Bertram",False,23.0,True -99,,female,0,True,False,False,"Doling, Mrs. John T (Ada Julia Bone)",True,34.0,True -100,,male,1,True,True,True,"Kantor, Mr. Sinai",False,34.0,False +is_mr,passenger_id,age,cabin,is_female,sibsp,is_male,survived,has_siblings,sex,name +True,81,22.0,,False,0,True,False,False,male,"Waelens, Mr. Achille" +True,82,29.0,,False,0,True,True,False,male,"Sheerlinck, Mr. Jan Baptist" +False,83,,,True,0,False,True,False,female,"McDermott, Miss. Brigdet Delia" +True,84,28.0,,False,0,True,False,False,male,"Carrau, Mr. Francisco M" +False,85,17.0,,True,0,False,True,False,female,"Ilett, Miss. Bertha" +True,86,33.0,,True,3,False,True,True,female,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)" +True,87,16.0,,False,1,True,False,True,male,"Ford, Mr. William Neal" +True,88,,,False,0,True,False,False,male,"Slocovski, Mr. Selman Francis" +False,89,23.0,C23 C25 C27,True,3,False,True,True,female,"Fortune, Miss. Mabel Helen" +True,90,24.0,,False,0,True,False,False,male,"Celotti, Mr. Francesco" +True,91,29.0,,False,0,True,False,False,male,"Christmann, Mr. Emil" +True,92,20.0,,False,0,True,False,False,male,"Andreasson, Mr. Paul Edvin" +True,93,46.0,E31,False,1,True,False,True,male,"Chaffee, Mr. Herbert Fuller" +True,94,26.0,,False,1,True,False,True,male,"Dean, Mr. Bertram Frank" +True,95,59.0,,False,0,True,False,False,male,"Coxon, Mr. Daniel" +True,96,,,False,0,True,False,False,male,"Shorney, Mr. Charles Joseph" +True,97,71.0,A5,False,0,True,False,False,male,"Goldschmidt, Mr. George B" +True,98,23.0,D10 D12,False,0,True,True,False,male,"Greenfield, Mr. William Bertram" +True,99,34.0,,True,0,False,True,False,female,"Doling, Mrs. John T (Ada Julia Bone)" +True,100,34.0,,False,1,True,False,True,male,"Kantor, Mr. Sinai"