diff --git a/aligned/active_learning/job.py b/aligned/active_learning/job.py index 9bbd781..e0cd027 100644 --- a/aligned/active_learning/job.py +++ b/aligned/active_learning/job.py @@ -21,12 +21,12 @@ class ActiveLearningJob(RetrivalJob): selection: ActiveLearningSelection write_policy: ActiveLearningWritePolicy - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: if not self.model.predictions_view.classification_targets: logger.info('Found no target. Therefore, no data will be written to an active learning dataset.') - return await self.job.to_polars() + return await self.job.to_lazy_polars() - data = await self.job.to_polars() + data = await self.job.to_lazy_polars() active_learning_set = self.selection.select(self.model, data, self.metric) await self.write_policy.write(active_learning_set, self.model) return data diff --git a/aligned/compiler/model.py b/aligned/compiler/model.py index 4e549cd..94e4939 100644 --- a/aligned/compiler/model.py +++ b/aligned/compiler/model.py @@ -4,6 +4,7 @@ import logging from dataclasses import dataclass, field from typing import Any, Callable, Type, TypeVar, Generic, TYPE_CHECKING +from datetime import timedelta from uuid import uuid4 @@ -54,6 +55,9 @@ class ModelMetadata: prediction_stream: StreamDataSource | None = field(default=None) application_source: BatchDataSource | None = field(default=None) + acceptable_freshness: timedelta | None = field(default=None) + unacceptable_freshness: timedelta | None = field(default=None) + exposed_at_url: str | None = field(default=None) dataset_store: DatasetStore | None = field(default=None) @@ -171,6 +175,8 @@ def model_contract( application_source: BatchDataSource | None = None, dataset_store: DatasetStore | StorageFileReference | None = None, exposed_at_url: str | None = None, + acceptable_freshness: timedelta | None = None, + unacceptable_freshness: timedelta | None = None, ) -> Callable[[Type[T]], ModelContractWrapper[T]]: def decorator(cls: Type[T]) -> ModelContractWrapper[T]: @@ -190,6 +196,8 @@ def decorator(cls: Type[T]) -> ModelContractWrapper[T]: application_source=application_source, dataset_store=resolve_dataset_store(dataset_store) if dataset_store else None, exposed_at_url=exposed_at_url, + acceptable_freshness=acceptable_freshness, + unacceptable_freshness=unacceptable_freshness, ) return ModelContractWrapper(metadata, cls) @@ -225,6 +233,8 @@ class MyModel(ModelContract): classification_targets=set(), regression_targets=set(), recommendation_targets=set(), + acceptable_freshness=metadata.acceptable_freshness, + unacceptable_freshness=metadata.unacceptable_freshness, ) probability_features: dict[str, set[TargetProbability]] = {} diff --git a/aligned/data_file.py b/aligned/data_file.py index 264dd81..0f9b547 100644 --- a/aligned/data_file.py +++ b/aligned/data_file.py @@ -27,9 +27,12 @@ async def read_pandas(self) -> pd.DataFrame: async def to_pandas(self) -> pd.DataFrame: return await self.read_pandas() - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: raise NotImplementedError() + async def to_polars(self) -> pl.DataFrame: + return (await self.to_lazy_polars()).collect() + async def write_polars(self, df: pl.LazyFrame) -> None: raise NotImplementedError() diff --git a/aligned/feature_store.py b/aligned/feature_store.py index c645810..07a539f 100644 --- a/aligned/feature_store.py +++ b/aligned/feature_store.py @@ -676,9 +676,9 @@ async def insert_into( import polars as pl columns = write_request.all_returned_columns - new_df = (await values.to_polars()).select(columns) + new_df = (await values.to_lazy_polars()).select(columns) try: - existing_df = await source.to_polars() + existing_df = await source.to_lazy_polars() write_df = pl.concat([new_df, existing_df.select(columns)], how='vertical_relaxed') except UnableToFindFileException: write_df = new_df @@ -710,10 +710,10 @@ async def upsert_into( if isinstance(source, WritableFeatureSource): await source.upsert(values, [write_request]) elif isinstance(source, DataFileReference): - new_df = (await values.to_polars()).select(write_request.all_returned_columns) + new_df = (await values.to_lazy_polars()).select(write_request.all_returned_columns) entities = list(write_request.entity_names) try: - existing_df = await source.to_polars() + existing_df = await source.to_lazy_polars() write_df = upsert_on_column(entities, new_df, existing_df) except UnableToFindFileException: write_df = new_df diff --git a/aligned/feature_view/combined_view.py b/aligned/feature_view/combined_view.py index eb7fd63..a18ab93 100644 --- a/aligned/feature_view/combined_view.py +++ b/aligned/feature_view/combined_view.py @@ -67,7 +67,7 @@ class SomeView: return store.feature_view(self.metadata.name) async def process(self, data: dict[str, list[Any]]) -> list[dict]: - df = await self.query().process_input(data).to_polars() + df = await self.query().process_input(data).to_lazy_polars() return df.collect().to_dicts() diff --git a/aligned/feature_view/feature_view.py b/aligned/feature_view/feature_view.py index 016baab..234e42f 100644 --- a/aligned/feature_view/feature_view.py +++ b/aligned/feature_view/feature_view.py @@ -5,6 +5,7 @@ import polars as pl import pandas as pd +from datetime import timedelta from abc import ABC, abstractproperty from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, TypeVar, Generic, Type, Callable @@ -58,6 +59,8 @@ class FeatureViewMetadata: materialized_source: BatchDataSource | None = field(default=None) contacts: list[str] | None = field(default=None) tags: dict[str, str] = field(default_factory=dict) + acceptable_freshness: timedelta | None = field(default=None) + unacceptable_freshness: timedelta | None = field(default=None) @staticmethod def from_compiled(view: CompiledFeatureView) -> FeatureViewMetadata: @@ -69,6 +72,8 @@ def from_compiled(view: CompiledFeatureView) -> FeatureViewMetadata: stream_source=view.stream_data_source, application_source=view.application_source, materialized_source=view.materialized_source, + acceptable_freshness=view.acceptable_freshness, + unacceptable_freshness=view.unacceptable_freshness, ) @@ -94,6 +99,8 @@ def feature_view( materialized_source: BatchDataSource | None = None, contacts: list[str] | None = None, tags: dict[str, str] | None = None, + acceptable_freshness: timedelta | None = None, + unacceptable_freshness: timedelta | None = None, ) -> Callable[[Type[T]], FeatureViewWrapper[T]]: def decorator(cls: Type[T]) -> FeatureViewWrapper[T]: @@ -106,6 +113,8 @@ def decorator(cls: Type[T]) -> FeatureViewWrapper[T]: materialized_source=materialized_source, contacts=contacts, tags=tags or {}, + acceptable_freshness=acceptable_freshness, + unacceptable_freshness=unacceptable_freshness, ) return FeatureViewWrapper(metadata, cls()) @@ -296,7 +305,7 @@ def process_input(self, data: ConvertableToRetrivalJob) -> RetrivalJob: return self.query().process_input(data) async def process(self, data: ConvertableToRetrivalJob) -> list[dict]: - df = await self.query().process_input(data).to_polars() + df = await self.query().process_input(data).to_lazy_polars() return df.collect().to_dicts() async def freshness_in_source(self, source: BatchDataSource) -> datetime | None: @@ -471,6 +480,8 @@ def compile_with_metadata(feature_view: Any, metadata: FeatureViewMetadata) -> C stream_data_source=metadata.stream_source, application_source=metadata.application_source, materialized_source=metadata.materialized_source, + acceptable_freshness=metadata.acceptable_freshness, + unacceptable_freshness=metadata.unacceptable_freshness, indexes=[], ) aggregations: list[FeatureFactory] = [] @@ -626,7 +637,7 @@ class SomeView(FeatureView): @classmethod async def process(cls, data: dict[str, list[Any]]) -> list[dict]: - df = await cls.query().process_input(data).to_polars() + df = await cls.query().process_input(data).to_lazy_polars() return df.collect().to_dicts() @staticmethod @@ -684,7 +695,7 @@ class MyView: @feature_view( name="{view_name}", description="some description", - source={batch_source_code} + source={batch_source_code}, stream_source=None, ) class MyView: diff --git a/aligned/feature_view/tests/test_joined_source.py b/aligned/feature_view/tests/test_joined_source.py index b23bd31..ad36ddd 100644 --- a/aligned/feature_view/tests/test_joined_source.py +++ b/aligned/feature_view/tests/test_joined_source.py @@ -78,10 +78,10 @@ async def test_join_different_types_polars() -> None: ) new_data = left_data.join(right_data, 'inner', left_on='some_id', right_on='some_id') - result = await new_data.to_polars() + result = await new_data.to_lazy_polars() joined = result.collect().sort('some_id', descending=False) - assert joined.frame_equal(expected_df.select(joined.columns)) + assert joined.equals(expected_df.select(joined.columns)) @pytest.mark.asyncio @@ -111,13 +111,13 @@ async def test_join_different_join_keys() -> None: new_data = left_data.join(right_data, 'inner', left_on='some_id', right_on='other_id') - result = await new_data.to_polars() + result = await new_data.to_lazy_polars() req_result = new_data.request_result joined = result.collect().sort('some_id', descending=False) - assert joined.frame_equal(expected_df.select(joined.columns)) - assert joined.select(req_result.entity_columns).frame_equal(expected_df.select(['some_id'])) + assert joined.equals(expected_df.select(joined.columns)) + assert joined.select(req_result.entity_columns).equals(expected_df.select(['some_id'])) @pytest.mark.asyncio @@ -136,10 +136,10 @@ async def test_unique_entities() -> None: }, ) - result = await data.unique_on(['some_id'], sort_key='feature').to_polars() + result = await data.unique_on(['some_id'], sort_key='feature').to_lazy_polars() sorted = result.sort('some_id').select(['some_id', 'feature']).collect() - assert sorted.frame_equal(expected_df) + assert sorted.equals(expected_df) @pytest.mark.asyncio diff --git a/aligned/jobs/tests/test_combined_job.py b/aligned/jobs/tests/test_combined_job.py index 981a30e..3d70b37 100644 --- a/aligned/jobs/tests/test_combined_job.py +++ b/aligned/jobs/tests/test_combined_job.py @@ -27,7 +27,7 @@ async def test_combined_polars( job = CombineFactualJob( jobs=[retrival_job, retrival_job_with_timestamp], combined_requests=[combined_retrival_request] ) - data = (await job.to_polars()).collect() + data = (await job.to_lazy_polars()).collect() assert set(data.columns) == {'id', 'a', 'b', 'c', 'd', 'created_at', 'c+d', 'a+c+d'} assert data.shape[0] == 5 diff --git a/aligned/jobs/tests/test_derived_job.py b/aligned/jobs/tests/test_derived_job.py index f86ef39..c889a83 100644 --- a/aligned/jobs/tests/test_derived_job.py +++ b/aligned/jobs/tests/test_derived_job.py @@ -128,7 +128,7 @@ def feature_store() -> FeatureStore: @pytest.mark.asyncio async def test_aggregate_over_derived() -> None: - data = await IncomeAgg.query().all().to_polars() + data = await IncomeAgg.query().all().to_lazy_polars() df = data.collect() @@ -142,7 +142,7 @@ async def test_aggregate_over_derived_fact() -> None: data = await store.features_for( entities={'user_id': ['a', 'b']}, features=['income_agg:total_amount'] - ).to_polars() + ).to_lazy_polars() df = data.collect() diff --git a/aligned/local/job.py b/aligned/local/job.py index f534081..d93d451 100644 --- a/aligned/local/job.py +++ b/aligned/local/job.py @@ -35,7 +35,7 @@ def request_result(self) -> RequestResult: async def to_pandas(self) -> pd.DataFrame: return self.df.collect().to_pandas() - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: return self.df @@ -217,10 +217,10 @@ async def file_transform_polars(self, df: pl.LazyFrame) -> pl.LazyFrame: return df async def to_pandas(self) -> pd.DataFrame: - return (await self.to_polars()).collect().to_pandas() + return (await self.to_lazy_polars()).collect().to_pandas() - async def to_polars(self) -> pl.LazyFrame: - file = await self.source.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + file = await self.source.to_lazy_polars() return await self.file_transform_polars(file) @@ -292,8 +292,8 @@ async def to_pandas(self) -> pd.DataFrame: file = await self.source.read_pandas() return self.file_transformations(file) - async def to_polars(self) -> pl.LazyFrame: - file = await self.source.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + file = await self.source.to_lazy_polars() return self.file_transform_polars(file) @@ -369,7 +369,7 @@ async def file_transformations(self, df: pl.LazyFrame) -> pl.LazyFrame: for request in self.requests: all_features.update(request.all_required_features) - result = await self.facts.to_polars() + result = await self.facts.to_lazy_polars() event_timestamp_col = 'aligned_event_timestamp' event_timestamp_entity_columns = [ @@ -489,10 +489,10 @@ async def file_transformations(self, df: pl.LazyFrame) -> pl.LazyFrame: return result.select([pl.exclude('row_id')]) async def to_pandas(self) -> pd.DataFrame: - return (await self.to_polars()).collect().to_pandas() + return (await self.to_lazy_polars()).collect().to_pandas() - async def to_polars(self) -> pl.LazyFrame: - return await self.file_transformations(await self.source.to_polars()) + async def to_lazy_polars(self) -> pl.LazyFrame: + return await self.file_transformations(await self.source.to_lazy_polars()) def log_each_job(self) -> RetrivalJob: from aligned.retrival_job import LogJob diff --git a/aligned/local/tests/test_jobs.py b/aligned/local/tests/test_jobs.py index ba5034c..f75d5bc 100644 --- a/aligned/local/tests/test_jobs.py +++ b/aligned/local/tests/test_jobs.py @@ -32,7 +32,7 @@ async def test_file_full_job_polars(retrival_request_without_derived: RetrivalRe } ) job = FileFullJob(source=LiteralReference(frame), request=retrival_request_without_derived) - data = (await job.to_polars()).collect() + data = (await job.to_lazy_polars()).collect() assert set(data.columns) == {'id', 'a', 'b'} assert data.shape[0] == 5 @@ -45,3 +45,4 @@ async def test_write_and_read_feature_store(titanic_feature_store_scd: FeatureSt await source.write(definition.to_json().encode('utf-8')) store = await source.feature_store() assert store is not None + assert store.model('titanic').model.predictions_view.acceptable_freshness is not None diff --git a/aligned/psql/jobs.py b/aligned/psql/jobs.py index 4155f68..1134b07 100644 --- a/aligned/psql/jobs.py +++ b/aligned/psql/jobs.py @@ -127,10 +127,10 @@ def will_load_list_feature(self) -> bool: return False async def to_pandas(self) -> pd.DataFrame: - df = await self.to_polars() + df = await self.to_lazy_polars() return df.collect().to_pandas() - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: try: return pl.read_database(self.query, self.config.url).lazy() except Exception as e: @@ -264,9 +264,9 @@ async def to_pandas(self) -> pd.DataFrame: job = await self.psql_job() return await job.to_pandas() - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: job = await self.psql_job() - return await job.to_polars() + return await job.to_lazy_polars() async def psql_job(self) -> PostgreSqlJob: if isinstance(self.facts, PostgreSqlJob): @@ -492,7 +492,7 @@ def aggregated_values_from_request(self, request: RetrivalRequest) -> list[Table return fetches async def build_request(self) -> str: - facts = await self.facts.to_polars() + facts = await self.facts.to_lazy_polars() return self.build_request_from_facts(facts) def build_request_from_facts(self, facts: pl.LazyFrame) -> str: diff --git a/aligned/redis/job.py b/aligned/redis/job.py index 1422995..592523d 100644 --- a/aligned/redis/job.py +++ b/aligned/redis/job.py @@ -25,16 +25,16 @@ def retrival_requests(self) -> list[RetrivalRequest]: return self.requests async def to_pandas(self) -> pd.DataFrame: - return (await self.to_polars()).collect().to_pandas() + return (await self.to_lazy_polars()).collect().to_pandas() def describe(self) -> str: features_to_load = [list(request.all_feature_names) for request in self.requests] return f'Loading features from Redis using HMGET {features_to_load}' - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: redis = self.config.redis() - result_df = (await self.facts.to_polars()).collect() + result_df = (await self.facts.to_lazy_polars()).collect() for request in self.requests: redis_combine_id = 'redis_combine_entity_id' diff --git a/aligned/redis/tests/test_redis_job.py b/aligned/redis/tests/test_redis_job.py index 0862e41..54966f2 100644 --- a/aligned/redis/tests/test_redis_job.py +++ b/aligned/redis/tests/test_redis_job.py @@ -149,6 +149,6 @@ async def test_write_job(mocker, retrival_request: RetrivalRequest) -> None: # await source.insert(insert_facts, [retrival_request]) job = FactualRedisJob(RedisConfig.localhost(), requests=[retrival_request], facts=facts) - data = await job.to_polars() + data = await job.to_lazy_polars() - assert data.collect().select('x').to_series().series_equal(pl.Series('x', [1, 2, 3, None])) + assert data.collect().select('x').to_series().equals(pl.Series('x', [1, 2, 3, None])) diff --git a/aligned/redshift/jobs.py b/aligned/redshift/jobs.py index e0d9398..11d49d9 100644 --- a/aligned/redshift/jobs.py +++ b/aligned/redshift/jobs.py @@ -63,9 +63,9 @@ async def to_pandas(self) -> pd.DataFrame: job = await self.psql_job() return await job.to_pandas() - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: job = await self.psql_job() - return await job.to_polars() + return await job.to_lazy_polars() async def psql_job(self) -> PostgreSqlJob: if isinstance(self.facts, PostgreSqlJob): diff --git a/aligned/redshift/sql_job.py b/aligned/redshift/sql_job.py index c3413ea..005b816 100644 --- a/aligned/redshift/sql_job.py +++ b/aligned/redshift/sql_job.py @@ -72,7 +72,7 @@ def select_table(table: TableFetch) -> str: from_table = 'FROM ' columns = [col.sql_select for col in table.columns] - select = f'SELECT {",".join(columns)}' + select = f'SELECT {', '.join(columns)}' if table.conditions: wheres = 'WHERE ' + ' AND '.join(table.conditions) @@ -169,10 +169,10 @@ def retrival_requests(self) -> list[RetrivalRequest]: return self.requests async def to_pandas(self) -> pd.DataFrame: - df = await self.to_polars() + df = await self.to_lazy_polars() return df.collect().to_pandas() - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: try: return pl.read_sql(self.query, self.config.url).lazy() except Exception as e: diff --git a/aligned/retrival_job.py b/aligned/retrival_job.py index 9a17ad0..5401643 100644 --- a/aligned/retrival_job.py +++ b/aligned/retrival_job.py @@ -253,9 +253,11 @@ async def update_metadata() -> None: await data_store.store_train_test_validate(test_metadata) return TrainTestValidateJob( - train_job=self.train_job.on_load(update_metadata).cached_at(train_source), - test_job=self.test_job.on_load(update_metadata).cached_at(test_source), - validate_job=self.validate_job.on_load(update_metadata).cached_at(validate_source), + train_job=self.train_job.cached_at(train_source).on_load(update_metadata).cached_at(train_source), + test_job=self.test_job.cached_at(test_source).on_load(update_metadata).cached_at(test_source), + validate_job=self.validate_job.cached_at(validate_source) + .on_load(update_metadata) + .cached_at(validate_source), target_columns=self.target_columns, ) @@ -283,8 +285,8 @@ async def to_pandas(self) -> SupervisedDataSet[pd.DataFrame]: data, entities, features, self.target_columns, self.job.request_result.event_timestamp ) - async def to_polars(self) -> SupervisedDataSet[pl.LazyFrame]: - data = await self.job.to_polars() + async def to_lazy_polars(self) -> SupervisedDataSet[pl.LazyFrame]: + data = await self.job.to_lazy_polars() if self.should_filter_out_null_targets: data = data.drop_nulls([column for column in self.target_columns]) @@ -375,7 +377,7 @@ class SupervisedTrainJob: train_size: float async def to_pandas(self) -> TrainTestSet[pd.DataFrame]: - core_data = await self.job.to_polars() + core_data = await self.job.to_lazy_polars() data = core_data.data.collect() data = data.to_pandas() @@ -393,7 +395,7 @@ async def to_pandas(self) -> TrainTestSet[pd.DataFrame]: async def to_polars(self) -> TrainTestSet[pl.DataFrame]: # Use the pandas method, as the split is not created for polars yet # A but unsure if I should use the same index concept for polars - core_data = await self.job.to_polars() + core_data = await self.job.to_lazy_polars() data = core_data.data.collect() @@ -470,9 +472,12 @@ async def to_pandas(self) -> pd.DataFrame: raise NotImplementedError(f'For {type(self)}') @abstractmethod - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: raise NotImplementedError(f'For {type(self)}') + async def to_polars(self) -> pl.DataFrame: + return await (await self.to_lazy_polars()).collect_async() + def describe(self) -> str: if isinstance(self, ModificationJob): return f'{self.job.describe()} -> {self.__class__.__name__}' @@ -734,7 +739,7 @@ async def write_to_source(self, source: WritableFeatureSource | DataFileReferenc from aligned.sources.local import DataFileReference if isinstance(source, DataFileReference): - await source.write_polars(await self.to_polars()) + await source.write_polars(await self.to_lazy_polars()) else: await source.insert(self, self.retrival_requests) @@ -757,12 +762,12 @@ class CustomPolarsJob(RetrivalJob, ModificationJob): job: RetrivalJob polars_method: Callable[[pl.LazyFrame], pl.LazyFrame] - async def to_polars(self) -> pl.LazyFrame: - df = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + df = await self.job.to_lazy_polars() return self.polars_method(df) async def to_pandas(self) -> pd.DataFrame: - df = await self.job.to_polars() + df = await self.job.to_lazy_polars() return df.collect().to_pandas() @@ -778,8 +783,8 @@ class SubsetJob(RetrivalJob, ModificationJob): def fraction(self) -> float: return self.end_ratio - self.start_ratio - async def to_polars(self) -> pl.LazyFrame: - data = (await self.job.to_polars()).collect() + async def to_lazy_polars(self) -> pl.LazyFrame: + data = (await self.job.to_lazy_polars()).collect() return subset_polars(data, self.start_ratio, self.end_ratio, self.sort_column).lazy() async def to_pandas(self) -> pd.DataFrame: @@ -799,8 +804,8 @@ async def to_pandas(self) -> pd.DataFrame: await self.on_load() return data - async def to_polars(self) -> pl.LazyFrame: - data = (await self.job.to_polars()).collect() + async def to_lazy_polars(self) -> pl.LazyFrame: + data = (await self.job.to_lazy_polars()).collect() await self.on_load() return data.lazy() @@ -815,12 +820,12 @@ class EncodeDatesJob(RetrivalJob, ModificationJob): formatter: DateFormatter columns: list[str] - async def to_polars(self) -> pl.LazyFrame: - data = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + data = await self.job.to_lazy_polars() return data.with_columns([self.formatter.encode_polars(column) for column in self.columns]) async def to_pandas(self) -> pd.DataFrame: - return (await self.to_polars()).collect().to_pandas() + return (await self.to_lazy_polars()).collect().to_pandas() @dataclass @@ -829,11 +834,11 @@ class InMemoryCacheJob(RetrivalJob, ModificationJob): job: RetrivalJob cached_data: pl.DataFrame | None = None - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: if self.cached_data is not None: return self.cached_data.lazy() - data = (await self.job.to_polars()).collect() + data = (await self.job.to_lazy_polars()).collect() self.cached_data = data return data.lazy() @@ -860,10 +865,10 @@ def request_result(self) -> RequestResult: def retrival_requests(self) -> list[RetrivalRequest]: return [self.agg_request] - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: from aligned.local.job import aggregate - core_frame = await self.job.to_polars() + core_frame = await self.job.to_lazy_polars() existing_cols = set(core_frame.columns) agg_features = {agg.name for agg in self.agg_request.aggregated_features} @@ -877,7 +882,7 @@ async def to_polars(self) -> pl.LazyFrame: return await aggregate(self.agg_request, core_frame) async def to_pandas(self) -> pd.DataFrame: - return (await self.to_polars()).collect().to_pandas() + return (await self.to_lazy_polars()).collect().to_pandas() def describe(self) -> str: return f'Aggregating over {self.job.describe()}' @@ -905,9 +910,9 @@ def request_result(self) -> RequestResult: def retrival_requests(self) -> list[RetrivalRequest]: return RetrivalRequest.combine(self.left_job.retrival_requests + self.right_job.retrival_requests) - async def to_polars(self) -> pl.LazyFrame: - left = await self.left_job.to_polars() - right = await self.right_job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + left = await self.left_job.to_lazy_polars() + right = await self.right_job.to_lazy_polars() return left.with_columns( pl.col(self.left_event_timestamp).dt.cast_time_unit(self.timestamp_unit), @@ -931,7 +936,7 @@ def log_each_job(self) -> RetrivalJob: return LogJob(sub_log) async def to_pandas(self) -> pd.DataFrame: - return (await self.to_polars()).collect().to_pandas() + return (await self.to_lazy_polars()).collect().to_pandas() def describe(self) -> str: return ( @@ -969,9 +974,9 @@ def request_result(self) -> RequestResult: def retrival_requests(self) -> list[RetrivalRequest]: return RetrivalRequest.combine(self.left_job.retrival_requests + self.right_job.retrival_requests) - async def to_polars(self) -> pl.LazyFrame: - left = await self.left_job.to_polars() - right = await self.right_job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + left = await self.left_job.to_lazy_polars() + right = await self.right_job.to_lazy_polars() return_request = self.left_job.request_result @@ -1027,8 +1032,8 @@ class FilteredJob(RetrivalJob, ModificationJob): job: RetrivalJob condition: DerivedFeature | Feature | str - async def to_polars(self) -> pl.LazyFrame: - df = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + df = await self.job.to_lazy_polars() if isinstance(self.condition, str): col = pl.col(self.condition) @@ -1078,8 +1083,8 @@ async def to_pandas(self) -> pd.DataFrame: df = await self.job.to_pandas() return df.rename(self.mappings) - async def to_polars(self) -> pl.LazyFrame: - df = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + df = await self.job.to_lazy_polars() return df.rename(self.mappings) @@ -1092,8 +1097,8 @@ class DropDuplicateEntities(RetrivalJob, ModificationJob): def entity_columns(self) -> list[str]: return self.job.request_result.entity_columns - async def to_polars(self) -> pl.LazyFrame: - df = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + df = await self.job.to_lazy_polars() return df.unique(subset=self.entity_columns) async def to_pandas(self) -> pd.DataFrame: @@ -1118,8 +1123,8 @@ def retrival_requests(self) -> list[RetrivalRequest]: async def to_pandas(self) -> pd.DataFrame: raise NotImplementedError() - async def to_polars(self) -> pl.LazyFrame: - data = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + data = await self.job.to_lazy_polars() update_jobs = [] for index in self.indexes: @@ -1155,7 +1160,7 @@ def retrival_requests(self) -> list[RetrivalRequest]: async def to_pandas(self) -> pd.DataFrame: return pd.DataFrame(self.data) - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: return pl.DataFrame(self.data).lazy() def describe(self) -> str: @@ -1191,13 +1196,13 @@ async def to_pandas(self) -> pd.DataFrame: logging.debug(df) return df - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: if logger.level == 0: logging.basicConfig(level=logging.DEBUG) job_name = self.retrival_requests[0].name logging.debug(f'Starting to run {type(self.job).__name__} - {job_name}') try: - df = await self.job.to_polars() + df = await self.job.to_lazy_polars() except Exception as error: logging.debug(f'Failed in job: {type(self.job).__name__} - {job_name}') raise error @@ -1237,9 +1242,9 @@ async def to_pandas(self) -> pd.DataFrame: list(DropInvalidJob.features_to_validate(self.retrival_requests)), await self.job.to_pandas() ) - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: return self.validator.validate_polars( - list(DropInvalidJob.features_to_validate(self.retrival_requests)), await self.job.to_polars() + list(DropInvalidJob.features_to_validate(self.retrival_requests)), await self.job.to_lazy_polars() ) def with_subfeatures(self) -> RetrivalJob: @@ -1328,8 +1333,8 @@ async def compute_derived_features_pandas(self, df: pd.DataFrame) -> pd.DataFram async def to_pandas(self) -> pd.DataFrame: return await self.compute_derived_features_pandas(await self.job.to_pandas()) - async def to_polars(self) -> pl.LazyFrame: - return await self.compute_derived_features_polars(await self.job.to_polars()) + async def to_lazy_polars(self) -> pl.LazyFrame: + return await self.compute_derived_features_polars(await self.job.to_lazy_polars()) def remove_derived_features(self) -> RetrivalJob: return self.job.remove_derived_features() @@ -1343,10 +1348,10 @@ class UniqueRowsJob(RetrivalJob, ModificationJob): sort_key: str | None = field(default=None) async def to_pandas(self) -> pd.DataFrame: - return (await self.to_polars()).collect().to_pandas() + return (await self.to_lazy_polars()).collect().to_pandas() - async def to_polars(self) -> pl.LazyFrame: - data = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + data = await self.job.to_lazy_polars() if self.sort_key: data = data.sort(self.sort_key, descending=True) @@ -1368,8 +1373,8 @@ async def to_pandas(self) -> pd.DataFrame: return data - async def to_polars(self) -> pl.LazyFrame: - data = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + data = await self.job.to_lazy_polars() for request in self.retrival_requests: if request.entity_names - set(data.columns): @@ -1401,8 +1406,8 @@ async def to_pandas(self) -> pd.DataFrame: data[feature] = None return data - async def to_polars(self) -> pl.LazyFrame: - data = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + data = await self.job.to_lazy_polars() for request in self.retrival_requests: missing = request.all_required_feature_names - set(data.columns) @@ -1453,7 +1458,7 @@ async def data_windows(self, window: AggregateOver, data: pl.DataFrame, now: dat raise ValueError('Condition is not supported for stream aggregation, yet') try: - window_data = (await checkpoint.to_polars()).collect() + window_data = (await checkpoint.to_lazy_polars()).collect() if filter_expr is not None: new_data = pl.concat( @@ -1477,8 +1482,8 @@ async def data_windows(self, window: AggregateOver, data: pl.DataFrame, now: dat async def to_pandas(self) -> pd.DataFrame: raise NotImplementedError() - async def to_polars(self) -> pl.LazyFrame: - data = (await self.job.to_polars()).collect() + async def to_lazy_polars(self) -> pl.LazyFrame: + data = (await self.job.to_lazy_polars()).collect() # This is used as a dummy frame, as the pl abstraction is not good enough lazy_df = pl.DataFrame({}).lazy() @@ -1530,7 +1535,7 @@ async def to_polars(self) -> AsyncIterator[pl.LazyFrame]: needed_requests = self.job.retrival_requests without_derived = self.job.remove_derived_features() - raw_files = (await without_derived.to_polars()).collect() + raw_files = (await without_derived.to_lazy_polars()).collect() features_to_include = self.job.request_result.features.union(self.job.request_result.entities) features_to_include_names = {feature.name for feature in features_to_include} @@ -1546,7 +1551,7 @@ async def to_polars(self) -> AsyncIterator[pl.LazyFrame]: .select_columns(features_to_include_names) ) - chunked_df = await chunked_job.to_polars() + chunked_df = await chunked_job.to_lazy_polars() yield chunked_df async def to_pandas(self) -> AsyncIterator[pd.DataFrame]: @@ -1586,8 +1591,8 @@ async def to_pandas(self) -> pd.DataFrame: .to_pandas() ) - async def to_polars(self) -> pl.LazyFrame: - return await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + return await self.job.to_lazy_polars() def cached_at(self, location: DataFileReference | str) -> RetrivalJob: return self @@ -1621,18 +1626,18 @@ async def to_pandas(self) -> pd.DataFrame: await self.location.write_pandas(df) return df - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: try: logger.debug('Trying to read cache file') - df = await self.location.to_polars() + df = await self.location.to_lazy_polars() except UnableToFindFileException: logger.debug('Unable to load file, so fetching from source') - df = await self.job.to_polars() + df = await self.job.to_lazy_polars() logger.debug('Writing result to cache') await self.location.write_polars(df) except FileNotFoundError: logger.debug('Unable to load file, so fetching from source') - df = await self.job.to_polars() + df = await self.job.to_lazy_polars() logger.debug('Writing result to cache') await self.location.write_polars(df) return df @@ -1684,8 +1689,8 @@ def retrival_requests(self) -> list[RetrivalRequest]: async def to_pandas(self) -> pd.DataFrame: return await self.job.to_pandas() - async def to_polars(self) -> pl.LazyFrame: - return await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + return await self.job.to_lazy_polars() @dataclass @@ -1707,9 +1712,9 @@ async def to_pandas(self) -> pd.DataFrame: self.time_metric.observe(elapsed) return df - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: start_time = timeit.default_timer() - df = await self.job.to_polars() + df = await self.job.to_lazy_polars() concrete = df.collect() elapsed = timeit.default_timer() - start_time logger.debug(f'Computed records in {elapsed} seconds') @@ -1778,8 +1783,8 @@ async def to_pandas(self) -> pd.DataFrame: df[feature.name] = pd.to_datetime(df[feature.name], infer_datetime_format=True, utc=True) return df - async def to_polars(self) -> pl.LazyFrame: - df = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + df = await self.job.to_lazy_polars() org_schema = dict(df.schema) for request in self.requests: features_to_check = request.all_required_features @@ -1915,9 +1920,9 @@ async def combine_polars_data(self, df: pl.LazyFrame) -> pl.LazyFrame: return df async def to_pandas(self) -> pd.DataFrame: - return (await self.to_polars()).collect().to_pandas() + return (await self.to_lazy_polars()).collect().to_pandas() - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: if not self.jobs: raise ValueError( 'Have no jobs to fetch. This is probably an internal error.\n' @@ -1925,7 +1930,7 @@ async def to_polars(self) -> pl.LazyFrame: 'Or maybe even submit a PR' ) - dfs: list[pl.LazyFrame] = await asyncio.gather(*[job.to_polars() for job in self.jobs]) + dfs: list[pl.LazyFrame] = await asyncio.gather(*[job.to_lazy_polars() for job in self.jobs]) results = [job.request_result for job in self.jobs] joined_entities = set(results[0].entity_columns) @@ -1989,8 +1994,8 @@ async def to_pandas(self) -> pd.DataFrame: else: return df - async def to_polars(self) -> pl.LazyFrame: - df = await self.job.to_polars() + async def to_lazy_polars(self) -> pl.LazyFrame: + df = await self.job.to_lazy_polars() if self.include_features: total_list = list({ent.name for ent in self.request_result.entities}.union(self.include_features)) return df.select(total_list) @@ -2037,10 +2042,10 @@ async def to_pandas(self) -> pd.DataFrame: await asyncio.gather(*[trigger.check_pandas(df, self.request_result) for trigger in self.triggers]) return df - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: import asyncio - df = await self.job.to_polars() + df = await self.job.to_lazy_polars() await asyncio.gather(*[trigger.check_polars(df, self.request_result) for trigger in self.triggers]) return df diff --git a/aligned/schemas/feature_view.py b/aligned/schemas/feature_view.py index 0d48687..0b00db5 100644 --- a/aligned/schemas/feature_view.py +++ b/aligned/schemas/feature_view.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from datetime import datetime +from datetime import datetime, timedelta from dataclasses import dataclass, field @@ -18,7 +18,6 @@ if TYPE_CHECKING: from aligned.retrival_job import RetrivalJob - @dataclass class CompiledFeatureView(Codable): name: str @@ -36,6 +35,9 @@ class CompiledFeatureView(Codable): application_source: BatchDataSource | None = field(default=None) materialized_source: BatchDataSource | None = field(default=None) + acceptable_freshness: timedelta | None = field(default=None) + unacceptable_freshness: timedelta | None = field(default=None) + event_triggers: set[EventTrigger] | None = field(default=None) contacts: list[str] | None = field(default=None) diff --git a/aligned/schemas/model.py b/aligned/schemas/model.py index 6246e02..42c1e2b 100644 --- a/aligned/schemas/model.py +++ b/aligned/schemas/model.py @@ -1,5 +1,6 @@ import logging from dataclasses import dataclass, field +from datetime import timedelta from aligned.request.retrival_request import FeatureRequest, RetrivalRequest from aligned.schemas.codable import Codable @@ -59,6 +60,9 @@ class PredictionsView(Codable): classification_targets: set[ClassificationTarget] | None = field(default=None) recommendation_targets: set[RecommendationTarget] | None = field(default=None) + acceptable_freshness: timedelta | None = field(default=None) + unacceptable_freshness: timedelta | None = field(default=None) + @property def full_schema(self) -> set[Feature]: diff --git a/aligned/schemas/transformation.py b/aligned/schemas/transformation.py index d9039e6..47f78a0 100644 --- a/aligned/schemas/transformation.py +++ b/aligned/schemas/transformation.py @@ -117,7 +117,7 @@ async def run_transformation_test_polars(cls) -> None: expected = test.output_polars if test.transformation.dtype == FeatureType.bool(): - is_correct = output.series_equal(test.output_polars.alias(alias)) + is_correct = output.equals(test.output_polars.alias(alias)) assert is_correct, ( f'Output for {cls.__name__} is not correct.,' f'\nGot: {output},\nexpected: {test.output_polars}' diff --git a/aligned/server.py b/aligned/server.py index 9c38550..da06fbf 100644 --- a/aligned/server.py +++ b/aligned/server.py @@ -159,7 +159,7 @@ async def get_model(entity_values: dict) -> str: for value in entity_values['event_timestamp'] ] - df = await feature_store.model(name).features_for(entity_values).to_polars() + df = await feature_store.model(name).features_for(entity_values).to_lazy_polars() pandas_df = df.collect().to_pandas() orient = 'values' body = ','.join( @@ -233,7 +233,9 @@ async def root() -> RedirectResponse: async def features(payload: APIFeatureRequest) -> dict: import json - df = await feature_store.features_for(payload.entities, features=payload.features).to_polars() + df = await feature_store.features_for( + payload.entities, features=payload.features + ).to_lazy_polars() json_data = json.dumps(df.collect().to_dict(as_series=False)) return Response(content=json_data, media_type='application/json') diff --git a/aligned/source_validation.py b/aligned/source_validation.py index cd43b3e..b5b0cfc 100644 --- a/aligned/source_validation.py +++ b/aligned/source_validation.py @@ -25,7 +25,7 @@ async def validate_sources_in(views: list[SourceRequest]) -> dict[FeatureLocatio for view in views: try: - _ = (await view.source.all_data(view.request, limit=1).to_polars()).collect() + _ = (await view.source.all_data(view.request, limit=1).to_lazy_polars()).collect() results[view.location] = True except Exception: results[view.location] = False diff --git a/aligned/sources/local.py b/aligned/sources/local.py index d3545f7..d7dc7ef 100644 --- a/aligned/sources/local.py +++ b/aligned/sources/local.py @@ -62,7 +62,7 @@ async def as_repo_definition(self) -> RepoDefinition: async def data_file_freshness(reference: DataFileReference, column_name: str) -> datetime | None: try: - file = await reference.to_polars() + file = await reference.to_lazy_polars() if isinstance(reference, ColumnFeatureMappable): source_column = reference.feature_identifier_for([column_name])[0] else: @@ -121,7 +121,7 @@ async def read_pandas(self) -> pd.DataFrame: except HTTPStatusError: raise UnableToFindFileException(self.path) - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: if self.path.startswith('http'): from io import BytesIO @@ -203,7 +203,7 @@ def multi_source_features_for( ) async def schema(self) -> dict[str, FeatureFactory]: - df = await self.to_polars() + df = await self.to_lazy_polars() return {name: FeatureType.from_polars(pl_type).feature_factory for name, pl_type in df.schema.items()} async def feature_view_code(self, view_name: str) -> str: @@ -268,7 +268,7 @@ async def write_pandas(self, df: pd.DataFrame) -> None: index=self.config.should_write_index, ) - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: if not do_file_exist(self.path): raise UnableToFindFileException(self.path) @@ -348,12 +348,12 @@ def __hash__(self) -> int: return hash(self.job_group_key()) async def read_pandas(self) -> pd.DataFrame: - return (await self.to_polars()).collect().to_pandas() + return (await self.to_lazy_polars()).collect().to_pandas() async def write_pandas(self, df: pd.DataFrame) -> None: await self.write_polars(pl.from_pandas(df).lazy()) - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: if not do_file_exist(self.path): raise UnableToFindFileException(self.path) @@ -389,7 +389,7 @@ async def insert(self, job: RetrivalJob, requests: list[RetrivalRequest]) -> Non request = requests[0] - data = await job.to_polars() + data = await job.to_lazy_polars() data.select(request.all_returned_columns).collect().write_delta(self.path, mode='append') async def upsert(self, job: RetrivalJob, requests: list[RetrivalRequest]) -> None: @@ -398,8 +398,8 @@ async def upsert(self, job: RetrivalJob, requests: list[RetrivalRequest]) -> Non request = requests[0] - new_data = await job.to_polars() - existing = await self.to_polars() + new_data = await job.to_lazy_polars() + existing = await self.to_lazy_polars() upsert_on_column(list(request.entity_names), new_data, existing).collect().write_delta( self.path, mode='overwrite' @@ -546,5 +546,5 @@ def job_group_key(self) -> str: async def read_pandas(self) -> pd.DataFrame: return self.file.collect().to_pandas() - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: return self.file diff --git a/aligned/sources/psql.py b/aligned/sources/psql.py index 214e0d9..df6b1a6 100644 --- a/aligned/sources/psql.py +++ b/aligned/sources/psql.py @@ -175,7 +175,7 @@ async def insert(self, job: RetrivalJob, requests: list[RetrivalRequest]) -> Non request = requests[0] - data = await job.to_polars() + data = await job.to_lazy_polars() data.select(request.all_returned_columns).collect().write_database( self.table, connection=self.config.url, if_exists='append' ) diff --git a/aligned/sources/redis.py b/aligned/sources/redis.py index f50f2a6..3df23e4 100644 --- a/aligned/sources/redis.py +++ b/aligned/sources/redis.py @@ -173,7 +173,7 @@ def features_for(self, facts: RetrivalJob, request: FeatureRequest) -> RetrivalJ async def insert(self, job: RetrivalJob, requests: list[RetrivalRequest]) -> None: redis = self.config.redis() - data = await job.to_polars() + data = await job.to_lazy_polars() async with redis.pipeline(transaction=True) as pipe: @@ -274,7 +274,7 @@ def make_redis_friendly(self, data: pl.LazyFrame, features: set[Feature]) -> pl. async def write_to_stream(self, job: RetrivalJob) -> None: redis = self.config.redis() - df = await job.to_polars() + df = await job.to_lazy_polars() df = self.make_redis_friendly(df, job.request_result.features.union(job.request_result.entities)) values = df.collect() diff --git a/aligned/sources/s3.py b/aligned/sources/s3.py index 6607491..ba079f3 100644 --- a/aligned/sources/s3.py +++ b/aligned/sources/s3.py @@ -189,7 +189,7 @@ async def read_pandas(self) -> pd.DataFrame: except HTTPStatusError: raise UnableToFindFileException() - async def to_polars(self) -> pl.LazyFrame: + async def to_lazy_polars(self) -> pl.LazyFrame: try: data = await self.storage.read(self.path) buffer = BytesIO(data) diff --git a/aligned/sources/tests/test_parquet.py b/aligned/sources/tests/test_parquet.py index 852f303..84b6af9 100644 --- a/aligned/sources/tests/test_parquet.py +++ b/aligned/sources/tests/test_parquet.py @@ -26,9 +26,9 @@ async def test_read_parquet(point_in_time_data_test: DataTest) -> None: ) store.add_feature_view(view) - stored = await store.feature_view(view.metadata.name).all().to_polars() + stored = await store.feature_view(view.metadata.name).all().to_lazy_polars() df = stored.select(source.data.columns).collect() - assert df.frame_equal(source.data) + assert df.equals(source.data) @pytest.mark.asyncio @@ -55,7 +55,7 @@ async def test_parquest(point_in_time_data_test: DataTest) -> None: point_in_time_data_test.feature_reference, event_timestamp_column='event_timestamp', ) - data = (await job.to_polars()).collect() + data = (await job.to_lazy_polars()).collect() expected = point_in_time_data_test.expected_output @@ -63,7 +63,7 @@ async def test_parquest(point_in_time_data_test: DataTest) -> None: assert set(expected.columns) == set(data.columns), f'Expected: {expected.columns}\nGot: {data.columns}' ordered_columns = data.select(expected.columns) - assert ordered_columns.frame_equal(expected), f'Expected: {expected}\nGot: {ordered_columns}' + assert ordered_columns.equals(expected), f'Expected: {expected}\nGot: {ordered_columns}' @pytest.mark.asyncio @@ -91,7 +91,7 @@ async def test_parquet_without_event_timestamp( point_in_time_data_test_wituout_event_timestamp.entities, point_in_time_data_test_wituout_event_timestamp.feature_reference, ) - data = (await job.to_polars()).collect() + data = (await job.to_lazy_polars()).collect() expected = point_in_time_data_test_wituout_event_timestamp.expected_output @@ -99,4 +99,4 @@ async def test_parquet_without_event_timestamp( assert set(expected.columns) == set(data.columns), f'Expected: {expected.columns}\nGot: {data.columns}' ordered_columns = data.select(expected.columns) - assert ordered_columns.frame_equal(expected), f'Expected: {expected}\nGot: {ordered_columns}' + assert ordered_columns.equals(expected), f'Expected: {expected}\nGot: {ordered_columns}' diff --git a/aligned/sources/tests/test_psql.py b/aligned/sources/tests/test_psql.py index 8ba5d94..df89ed1 100644 --- a/aligned/sources/tests/test_psql.py +++ b/aligned/sources/tests/test_psql.py @@ -43,7 +43,7 @@ async def test_postgresql(point_in_time_data_test: DataTest, psql: PostgreSQLCon point_in_time_data_test.feature_reference, event_timestamp_column='event_timestamp', ) - data = (await job.to_polars()).collect() + data = (await job.to_lazy_polars()).collect() expected = point_in_time_data_test.expected_output @@ -51,7 +51,7 @@ async def test_postgresql(point_in_time_data_test: DataTest, psql: PostgreSQLCon assert set(expected.columns) == set(data.columns), f'Expected: {expected.columns}\nGot: {data.columns}' ordered_columns = data.select(expected.columns) - assert ordered_columns.frame_equal(expected), f'Expected: {expected}\nGot: {ordered_columns}' + assert ordered_columns.equals(expected), f'Expected: {expected}\nGot: {ordered_columns}' @pytest.mark.skipif( @@ -69,7 +69,7 @@ async def test_postgresql_write(titanic_feature_store: FeatureStore, psql: Postg store = titanic_feature_store.model('titanic').using_source(source) await store.insert_predictions(data) - stored_data = await psql.fetch('SELECT * FROM titanic').to_polars() + stored_data = await psql.fetch('SELECT * FROM titanic').to_lazy_polars() assert_frame_equal( pl.DataFrame(data), stored_data.collect(), @@ -78,7 +78,7 @@ async def test_postgresql_write(titanic_feature_store: FeatureStore, psql: Postg check_dtype=False, ) - preds = await store.predictions_for({'passenger_id': [1, 3, 2, 4]}).to_polars() + preds = await store.predictions_for({'passenger_id': [1, 3, 2, 4]}).to_lazy_polars() assert_frame_equal( pl.DataFrame(data), preds.collect(), @@ -116,7 +116,7 @@ async def test_postgresql_without_event( point_in_time_data_test_wituout_event_timestamp.entities, point_in_time_data_test_wituout_event_timestamp.feature_reference, ) - data = (await job.to_polars()).collect() + data = (await job.to_lazy_polars()).collect() expected = point_in_time_data_test_wituout_event_timestamp.expected_output @@ -124,4 +124,4 @@ async def test_postgresql_without_event( assert set(expected.columns) == set(data.columns), f'Expected: {expected.columns}\nGot: {data.columns}' ordered_columns = data.select(expected.columns) - assert ordered_columns.frame_equal(expected), f'Expected: {expected}\nGot: {ordered_columns}' + assert ordered_columns.equals(expected), f'Expected: {expected}\nGot: {ordered_columns}' diff --git a/aligned/split_strategy.py b/aligned/split_strategy.py index a0c1852..9c41473 100644 --- a/aligned/split_strategy.py +++ b/aligned/split_strategy.py @@ -47,29 +47,29 @@ def sorted_features(self) -> list[str]: def train(self) -> DatasetType: if isinstance(self.data, pl.DataFrame): return self.data[self.train_index.to_list(), :] - return self.data.iloc[self.train_index] + return self.data.iloc[self.train_index] # type: ignore @property def train_input(self) -> DatasetType: - return self.train[self.sorted_features] + return self.train[self.sorted_features] # type: ignore @property def train_target(self) -> DatasetType: - return self.train[list(self.target_columns)] + return self.train[list(self.target_columns)] # type: ignore @property def test(self) -> DatasetType: if isinstance(self.data, pl.DataFrame): return self.data[self.test_index.to_list(), :] - return self.data.iloc[self.test_index] + return self.data.iloc[self.test_index] # type: ignore @property def test_input(self) -> DatasetType: - return self.test[self.sorted_features] + return self.test[self.sorted_features] # type: ignore @property def test_target(self) -> DatasetType: - return self.test[list(self.target_columns)] + return self.test[list(self.target_columns)] # type: ignore class SupervisedDataSet(Generic[DatasetType]): @@ -102,19 +102,19 @@ def __init__( def entities(self) -> DatasetType: if isinstance(self.data, (pl.LazyFrame, pl.DataFrame)): return self.data.select(list(self.entity_columns)) - return self.data[list(self.entity_columns)] + return self.data[list(self.entity_columns)] # type: ignore @property def input(self) -> DatasetType: if isinstance(self.data, (pl.LazyFrame, pl.DataFrame)): return self.data.select(self.sorted_features) - return self.data[self.sorted_features] + return self.data[self.sorted_features] # type: ignore @property def labels(self) -> DatasetType: if isinstance(self.data, (pl.LazyFrame, pl.DataFrame)): return self.data.select(list(self.target_columns)) - return self.data[list(self.target_columns)] + return self.data[list(self.target_columns)] # type: ignore class TrainTestValidateSet(Generic[DatasetType]): @@ -158,22 +158,22 @@ def sorted_features(self) -> list[str]: def input(self) -> DatasetType: if isinstance(self.data, pl.LazyFrame): return self.data.select(self.sorted_features) - return self.data[self.sorted_features] + return self.data[self.sorted_features] # type: ignore @property def labels(self) -> DatasetType: if isinstance(self.data, pl.LazyFrame): return self.data.select(sorted(self.target_columns)) - return self.data[sorted(self.target_columns)] + return self.data[sorted(self.target_columns)] # type: ignore @property def train(self) -> SupervisedDataSet[DatasetType]: if isinstance(self.data, pl.DataFrame): data = self.data[self.train_index.to_list(), :] else: - data = self.data.iloc[self.train_index] + data = self.data.iloc[self.train_index] # type: ignore - return SupervisedDataSet( + return SupervisedDataSet( # type: ignore data, self.entity_columns, self.feature_columns, @@ -195,9 +195,9 @@ def test(self) -> SupervisedDataSet[DatasetType]: if isinstance(self.data, pl.DataFrame): data = self.data[self.test_index.to_list(), :] else: - data = self.data.iloc[self.test_index] + data = self.data.iloc[self.test_index] # type: ignore - return SupervisedDataSet( + return SupervisedDataSet( # type: ignore data, set(self.entity_columns), set(self.feature_columns), @@ -218,8 +218,8 @@ def validate(self) -> SupervisedDataSet[DatasetType]: if isinstance(self.data, pl.DataFrame): data = self.data[self.validate_index.to_list(), :] else: - data = self.data.iloc[self.validate_index] - return SupervisedDataSet( + data = self.data.iloc[self.validate_index] # type: ignore + return SupervisedDataSet( # type: ignore data, self.entity_columns, set(self.feature_columns), @@ -311,7 +311,7 @@ def split(data: DataFrame, start_ratio: float, end_ratio: float) -> DataFrame: [develop, split(sub_group, self.train_size_percentage + self.test_size_percentage, 1)], axis=0 ) - return SplitDataSet( + return SplitDataSet( # type: ignore train_input=train.drop(columns=[target_column]), train_target=train[target_column], develop_input=develop.drop(columns=[target_column]), diff --git a/aligned/tests/test_model_target.py b/aligned/tests/test_model_target.py index 4ccf122..17bbe47 100644 --- a/aligned/tests/test_model_target.py +++ b/aligned/tests/test_model_target.py @@ -56,15 +56,15 @@ async def test_titanic_model_with_targets_and_scd(titanic_feature_store_scd: Fea await titanic_feature_store_scd.model('titanic') .with_labels() .features_for(entities.to_dict(as_series=False), event_timestamp_column='event_timestamp') - .to_polars() + .to_lazy_polars() ) input_df = dataset.input.collect() target_df = dataset.labels.collect() - assert target_df['survived'].series_equal(expected_data['survived']) - assert input_df['is_male'].series_equal(expected_data['is_male']) - assert input_df['age'].series_equal(expected_data['age']) + assert target_df['survived'].equals(expected_data['survived']) + assert input_df['is_male'].equals(expected_data['is_male']) + assert input_df['age'].equals(expected_data['age']) @pytest.mark.asyncio @@ -125,7 +125,7 @@ class TestModel: await store.insert_into(FeatureLocation.model('test_model'), {'id': [1, 2, 3], 'a': [10, 14, 20]}) stored_data = pl.read_parquet(path).select(expected_frame.columns) - assert stored_data.frame_equal(expected_frame) + assert stored_data.equals(expected_frame) @pytest.mark.asyncio @@ -157,4 +157,4 @@ class TestModel: columns = set(stored_data.columns).difference(expected_frame.columns) assert len(columns) == 0 - assert stored_data.select(expected_frame.columns).frame_equal(expected_frame) + assert stored_data.select(expected_frame.columns).equals(expected_frame) diff --git a/aligned/tests/test_transformations.py b/aligned/tests/test_transformations.py index d2894fb..9819b3b 100644 --- a/aligned/tests/test_transformations.py +++ b/aligned/tests/test_transformations.py @@ -112,10 +112,12 @@ class TestAgg: org_values_job = TestAgg.query().using_source(TestAgg.metadata.source).all() # type: ignore await org_values_job.write_to_source(materialized_source) - values = await org_values_job.to_polars() - df = await TestAgg.query().all().to_polars() # type: ignore + values = await org_values_job.to_lazy_polars() + descrete_values = await org_values_job.to_polars() + df = await TestAgg.query().all().to_lazy_polars() # type: ignore - assert df.sort('dob_ssn').collect().frame_equal(values.sort('dob_ssn').select(df.columns).collect()) + assert df.sort('dob_ssn').collect().equals(values.sort('dob_ssn').select(df.columns).collect()) + assert descrete_values.sort('dob_ssn').equals(values.sort('dob_ssn').select(descrete_values.columns).collect()) @pytest.mark.asyncio diff --git a/aligned/worker.py b/aligned/worker.py index 7a098d0..03b5dcd 100644 --- a/aligned/worker.py +++ b/aligned/worker.py @@ -248,7 +248,7 @@ async def process_predictions( active_learning_config.selection, active_learning_config.write_policy, ) - _ = await job.to_polars() + _ = await job.to_lazy_polars() logger.debug( f'Processing {len(records)} predictions in {timeit.default_timer() - start_time} seconds' diff --git a/conftest.py b/conftest.py index 4126732..7b22d34 100644 --- a/conftest.py +++ b/conftest.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from datetime import timedelta from math import ceil, floor import polars as pl @@ -632,6 +633,8 @@ def titanic_model_scd(titanic_feature_view_scd: FeatureView) -> ModelContractWra 'titanic', description='A model predicting if a passenger will survive', features=[features.age, features.sibsp, features.has_siblings, features.is_male], # type: ignore + acceptable_freshness=timedelta(days=1), + unacceptable_freshness=timedelta(days=2), ) class Titanic: diff --git a/setup.cfg b/setup.cfg index 5a556ea..5833453 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,7 +6,7 @@ inline-quotes = 'double' # W503 line break before binary operator - not Black/PEP8 compatible # SIM106 handle error cases first # TC002 Move third-party import into a type-checking block (not compatible with pydantic) -ignore = E203, W503, SIM106, TC002, SIM110, TC001 +ignore = E203, W503, SIM106, TC002, SIM110, TC001, E231, E201, E202, E241 enable-extensions = TC, TC1 pytest-mark-no-parentheses=true pytest-fixture-no-parentheses=true diff --git a/test_data/credit_history_mater.parquet b/test_data/credit_history_mater.parquet index 02e451b..af9c9a5 100644 Binary files a/test_data/credit_history_mater.parquet and b/test_data/credit_history_mater.parquet differ diff --git a/test_data/feature-store.json b/test_data/feature-store.json index b05e14b..72064a8 100644 --- a/test_data/feature-store.json +++ b/test_data/feature-store.json @@ -1 +1 @@ -{"metadata": {"created_at": "2024-02-03T20:34:56.568052", "name": "feature_store_location.py", "repo_url": null, "github_url": null}, "feature_views": [{"name": "titanic_parquet", "tags": {}, "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "event_triggers": null, "contacts": null, "indexes": []}, {"name": "titanic", "tags": {}, "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "updated_at", "dtype": {"name": "datetime"}, "description": null, "tags": null, "constraints": null}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}], "combined_feature_views": [], "models": [{"name": "titanic", "features": {"default_version": "default", "versions": {"default": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}]}}, "predictions_view": {"entities": [], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}], "recommendation_targets": []}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_store": null, "exposed_at_url": null}], "enrichers": []} +{"metadata": {"created_at": "2024-02-10T12:37:50.762541", "name": "feature_store_location.py", "repo_url": null, "github_url": null}, "feature_views": [{"name": "titanic", "tags": {}, "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "updated_at", "dtype": {"name": "datetime"}, "description": null, "tags": null, "constraints": null}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}, {"name": "titanic_parquet", "tags": {}, "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}], "derived_features": [{"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}], "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": []}], "combined_feature_views": [], "models": [{"name": "titanic", "features": {"default_version": "default", "versions": {"default": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}]}}, "predictions_view": {"entities": [], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}], "recommendation_targets": [], "acceptable_freshness": 86400.0, "unacceptable_freshness": 172800.0}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_store": null, "exposed_at_url": null}], "enrichers": []} \ No newline at end of file diff --git a/test_data/test_model.parquet b/test_data/test_model.parquet index 78c31e3..537db00 100644 Binary files a/test_data/test_model.parquet and b/test_data/test_model.parquet differ diff --git a/test_data/titanic-sets.json b/test_data/titanic-sets.json index 32c55a9..4f0c23f 100644 --- a/test_data/titanic-sets.json +++ b/test_data/titanic-sets.json @@ -1 +1 @@ -{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}, {"name": "optional"}]}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}, {"name": "optional"}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []} +{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}, {"name": "optional"}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}, {"name": "optional"}]}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []} \ No newline at end of file diff --git a/test_data/titanic-test.csv b/test_data/titanic-test.csv index 741ec4d..d7321c4 100644 --- a/test_data/titanic-test.csv +++ b/test_data/titanic-test.csv @@ -1,21 +1,21 @@ -is_mr,passenger_id,age,cabin,is_female,sibsp,is_male,survived,has_siblings,sex,name -True,61,22.0,,False,0,True,False,False,male,"Sirayanian, Mr. Orsen" -False,62,38.0,B28,True,0,False,True,False,female,"Icard, Miss. Amelie" -True,63,45.0,C83,False,1,True,False,True,male,"Harris, Mr. Henry Birkhardt" -False,64,4.0,,False,3,True,False,True,male,"Skoog, Master. Harald" -True,65,,,False,0,True,False,False,male,"Stewart, Mr. Albert A" -False,66,,,False,1,True,True,True,male,"Moubarek, Master. Gerios" -True,67,29.0,F33,True,0,False,True,False,female,"Nye, Mrs. (Elizabeth Ramell)" -True,68,19.0,,False,0,True,False,False,male,"Crease, Mr. Ernest James" -False,69,17.0,,True,4,False,True,True,female,"Andersson, Miss. Erna Alexandra" -True,70,26.0,,False,2,True,False,True,male,"Kink, Mr. Vincenz" -True,71,32.0,,False,0,True,False,False,male,"Jenkin, Mr. Stephen Curnow" -False,72,16.0,,True,5,False,False,True,female,"Goodwin, Miss. Lillian Amy" -True,73,21.0,,False,0,True,False,False,male,"Hood, Mr. Ambrose Jr" -True,74,26.0,,False,1,True,False,True,male,"Chronopoulos, Mr. Apostolos" -True,75,32.0,,False,0,True,True,False,male,"Bing, Mr. Lee" -True,76,25.0,F G73,False,0,True,False,False,male,"Moen, Mr. Sigurd Hansen" -True,77,,,False,0,True,False,False,male,"Staneff, Mr. Ivan" -True,78,,,False,0,True,False,False,male,"Moutal, Mr. Rahamin Haim" -False,79,0.83,,False,0,True,True,False,male,"Caldwell, Master. Alden Gates" -False,80,30.0,,True,0,False,True,False,female,"Dowdell, Miss. Elizabeth" +age,sex,name,is_mr,passenger_id,is_female,cabin,sibsp,has_siblings,survived,is_male +22.0,male,"Sirayanian, Mr. Orsen",True,61,False,,0,False,False,True +38.0,female,"Icard, Miss. Amelie",False,62,True,B28,0,False,True,False +45.0,male,"Harris, Mr. Henry Birkhardt",True,63,False,C83,1,True,False,True +4.0,male,"Skoog, Master. Harald",False,64,False,,3,True,False,True +,male,"Stewart, Mr. Albert A",True,65,False,,0,False,False,True +,male,"Moubarek, Master. Gerios",False,66,False,,1,True,True,True +29.0,female,"Nye, Mrs. (Elizabeth Ramell)",True,67,True,F33,0,False,True,False +19.0,male,"Crease, Mr. Ernest James",True,68,False,,0,False,False,True +17.0,female,"Andersson, Miss. Erna Alexandra",False,69,True,,4,True,True,False +26.0,male,"Kink, Mr. Vincenz",True,70,False,,2,True,False,True +32.0,male,"Jenkin, Mr. Stephen Curnow",True,71,False,,0,False,False,True +16.0,female,"Goodwin, Miss. Lillian Amy",False,72,True,,5,True,False,False +21.0,male,"Hood, Mr. Ambrose Jr",True,73,False,,0,False,False,True +26.0,male,"Chronopoulos, Mr. Apostolos",True,74,False,,1,True,False,True +32.0,male,"Bing, Mr. Lee",True,75,False,,0,False,True,True +25.0,male,"Moen, Mr. Sigurd Hansen",True,76,False,F G73,0,False,False,True +,male,"Staneff, Mr. Ivan",True,77,False,,0,False,False,True +,male,"Moutal, Mr. Rahamin Haim",True,78,False,,0,False,False,True +0.83,male,"Caldwell, Master. Alden Gates",False,79,False,,0,False,True,True +30.0,female,"Dowdell, Miss. Elizabeth",False,80,True,,0,False,True,False diff --git a/test_data/titanic-train.csv b/test_data/titanic-train.csv index 925bdb7..04997e7 100644 --- a/test_data/titanic-train.csv +++ b/test_data/titanic-train.csv @@ -1,61 +1,61 @@ -is_mr,passenger_id,age,cabin,is_female,sibsp,is_male,survived,has_siblings,sex,name -True,1,22.0,,False,1,True,False,True,male,"Braund, Mr. Owen Harris" -True,2,38.0,C85,True,1,False,True,True,female,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)" -False,3,26.0,,True,0,False,True,False,female,"Heikkinen, Miss. Laina" -True,4,35.0,C123,True,1,False,True,True,female,"Futrelle, Mrs. Jacques Heath (Lily May Peel)" -True,5,35.0,,False,0,True,False,False,male,"Allen, Mr. William Henry" -True,6,,,False,0,True,False,False,male,"Moran, Mr. James" -True,7,54.0,E46,False,0,False,False,False,other,"McCarthy, Mr. Timothy J" -False,8,2.0,,False,3,True,False,True,male,"Palsson, Master. Gosta Leonard" -True,9,27.0,,True,0,False,True,False,female,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)" -True,10,14.0,,True,1,False,True,True,female,"Nasser, Mrs. Nicholas (Adele Achem)" -False,11,4.0,G6,True,1,False,True,True,female,"Sandstrom, Miss. Marguerite Rut" -False,12,58.0,C103,True,0,False,True,False,female,"Bonnell, Miss. Elizabeth" -True,13,20.0,,False,0,True,False,False,male,"Saundercock, Mr. William Henry" -True,14,39.0,,False,1,True,False,True,male,"Andersson, Mr. Anders Johan" -False,15,14.0,,True,0,False,False,False,female,"Vestrom, Miss. Hulda Amanda Adolfina" -True,16,55.0,,True,0,False,True,False,female,"Hewlett, Mrs. (Mary D Kingcome) " -False,17,2.0,,False,4,True,False,True,male,"Rice, Master. Eugene" -True,18,,,False,0,True,True,False,male,"Williams, Mr. Charles Eugene" -True,19,31.0,,True,1,False,False,True,female,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)" -True,20,,,True,0,False,True,False,female,"Masselmani, Mrs. Fatima" -True,21,35.0,,False,0,True,False,False,male,"Fynney, Mr. Joseph J" -True,22,34.0,D56,False,0,True,True,False,male,"Beesley, Mr. Lawrence" -False,23,15.0,,True,0,False,True,False,female,"McGowan, Miss. Anna ""Annie""" -True,24,28.0,A6,False,0,True,True,False,male,"Sloper, Mr. William Thompson" -False,25,8.0,,True,3,False,False,True,female,"Palsson, Miss. Torborg Danira" -True,26,38.0,,True,1,False,True,True,female,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)" -True,27,,,False,0,True,False,False,male,"Emir, Mr. Farred Chehab" -True,28,19.0,C23 C25 C27,False,3,True,False,True,male,"Fortune, Mr. Charles Alexander" -False,29,,,True,0,False,True,False,female,"O'Dwyer, Miss. Ellen ""Nellie""" -True,30,,,False,0,True,False,False,male,"Todoroff, Mr. Lalio" -False,31,40.0,,False,0,True,False,False,male,"Uruchurtu, Don. Manuel E" -True,32,,B78,True,1,False,True,True,female,"Spencer, Mrs. William Augustus (Marie Eugenie)" -False,33,,,True,0,False,True,False,female,"Glynn, Miss. Mary Agatha" -True,34,66.0,,False,0,True,False,False,male,"Wheadon, Mr. Edward H" -True,35,28.0,,False,1,True,False,True,male,"Meyer, Mr. Edgar Joseph" -True,36,42.0,,False,1,True,False,True,male,"Holverson, Mr. Alexander Oskar" -True,37,,,False,0,True,True,False,male,"Mamee, Mr. Hanna" -True,38,21.0,,False,0,True,False,False,male,"Cann, Mr. Ernest Charles" -False,39,18.0,,True,2,False,False,True,female,"Vander Planke, Miss. Augusta Maria" -False,40,14.0,,True,1,False,True,True,female,"Nicola-Yarred, Miss. Jamila" -True,41,40.0,,True,1,False,False,True,female,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)" -True,42,27.0,,True,1,False,False,True,female,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)" -True,43,,,False,0,True,False,False,male,"Kraeff, Mr. Theodor" -False,44,3.0,,True,1,False,True,True,female,"Laroche, Miss. Simonne Marie Anne Andree" -False,45,19.0,,True,0,False,True,False,female,"Devaney, Miss. Margaret Delia" -True,46,,,False,0,True,False,False,male,"Rogers, Mr. William John" -True,47,,,False,1,True,False,True,male,"Lennon, Mr. Denis" -False,48,,,True,0,False,True,False,female,"O'Driscoll, Miss. Bridget" -True,49,,,False,2,True,False,True,male,"Samaan, Mr. Youssef" -True,50,18.0,,True,1,False,False,True,female,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)" -False,51,7.0,,False,4,True,False,True,male,"Panula, Master. Juha Niilo" -True,52,21.0,,False,0,True,False,False,male,"Nosworthy, Mr. Richard Cater" -True,53,49.0,D33,True,1,False,True,True,female,"Harper, Mrs. Henry Sleeper (Myna Haxtun)" -True,54,29.0,,True,1,False,True,True,female,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)" -True,55,65.0,B30,False,0,True,False,False,male,"Ostby, Mr. Engelhart Cornelius" -True,56,,C52,False,0,True,True,False,male,"Woolner, Mr. Hugh" -False,57,21.0,,True,0,False,True,False,female,"Rugg, Miss. Emily" -True,58,28.5,,False,0,True,False,False,male,"Novel, Mr. Mansouer" -False,59,5.0,,True,1,False,True,True,female,"West, Miss. Constance Mirium" -False,60,11.0,,False,5,True,False,True,male,"Goodwin, Master. William Frederick" +age,sex,name,is_mr,passenger_id,is_female,cabin,sibsp,has_siblings,survived,is_male +22.0,male,"Braund, Mr. Owen Harris",True,1,False,,1,True,False,True +38.0,female,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",True,2,True,C85,1,True,True,False +26.0,female,"Heikkinen, Miss. Laina",False,3,True,,0,False,True,False +35.0,female,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",True,4,True,C123,1,True,True,False +35.0,male,"Allen, Mr. William Henry",True,5,False,,0,False,False,True +,male,"Moran, Mr. James",True,6,False,,0,False,False,True +54.0,other,"McCarthy, Mr. Timothy J",True,7,False,E46,0,False,False,False +2.0,male,"Palsson, Master. Gosta Leonard",False,8,False,,3,True,False,True +27.0,female,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",True,9,True,,0,False,True,False +14.0,female,"Nasser, Mrs. Nicholas (Adele Achem)",True,10,True,,1,True,True,False +4.0,female,"Sandstrom, Miss. Marguerite Rut",False,11,True,G6,1,True,True,False +58.0,female,"Bonnell, Miss. Elizabeth",False,12,True,C103,0,False,True,False +20.0,male,"Saundercock, Mr. William Henry",True,13,False,,0,False,False,True +39.0,male,"Andersson, Mr. Anders Johan",True,14,False,,1,True,False,True +14.0,female,"Vestrom, Miss. Hulda Amanda Adolfina",False,15,True,,0,False,False,False +55.0,female,"Hewlett, Mrs. (Mary D Kingcome) ",True,16,True,,0,False,True,False +2.0,male,"Rice, Master. Eugene",False,17,False,,4,True,False,True +,male,"Williams, Mr. Charles Eugene",True,18,False,,0,False,True,True +31.0,female,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",True,19,True,,1,True,False,False +,female,"Masselmani, Mrs. Fatima",True,20,True,,0,False,True,False +35.0,male,"Fynney, Mr. Joseph J",True,21,False,,0,False,False,True +34.0,male,"Beesley, Mr. Lawrence",True,22,False,D56,0,False,True,True +15.0,female,"McGowan, Miss. Anna ""Annie""",False,23,True,,0,False,True,False +28.0,male,"Sloper, Mr. William Thompson",True,24,False,A6,0,False,True,True +8.0,female,"Palsson, Miss. Torborg Danira",False,25,True,,3,True,False,False +38.0,female,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",True,26,True,,1,True,True,False +,male,"Emir, Mr. Farred Chehab",True,27,False,,0,False,False,True +19.0,male,"Fortune, Mr. Charles Alexander",True,28,False,C23 C25 C27,3,True,False,True +,female,"O'Dwyer, Miss. Ellen ""Nellie""",False,29,True,,0,False,True,False +,male,"Todoroff, Mr. Lalio",True,30,False,,0,False,False,True +40.0,male,"Uruchurtu, Don. Manuel E",False,31,False,,0,False,False,True +,female,"Spencer, Mrs. William Augustus (Marie Eugenie)",True,32,True,B78,1,True,True,False +,female,"Glynn, Miss. Mary Agatha",False,33,True,,0,False,True,False +66.0,male,"Wheadon, Mr. Edward H",True,34,False,,0,False,False,True +28.0,male,"Meyer, Mr. Edgar Joseph",True,35,False,,1,True,False,True +42.0,male,"Holverson, Mr. Alexander Oskar",True,36,False,,1,True,False,True +,male,"Mamee, Mr. Hanna",True,37,False,,0,False,True,True +21.0,male,"Cann, Mr. Ernest Charles",True,38,False,,0,False,False,True +18.0,female,"Vander Planke, Miss. Augusta Maria",False,39,True,,2,True,False,False +14.0,female,"Nicola-Yarred, Miss. Jamila",False,40,True,,1,True,True,False +40.0,female,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",True,41,True,,1,True,False,False +27.0,female,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",True,42,True,,1,True,False,False +,male,"Kraeff, Mr. Theodor",True,43,False,,0,False,False,True +3.0,female,"Laroche, Miss. Simonne Marie Anne Andree",False,44,True,,1,True,True,False +19.0,female,"Devaney, Miss. Margaret Delia",False,45,True,,0,False,True,False +,male,"Rogers, Mr. William John",True,46,False,,0,False,False,True +,male,"Lennon, Mr. Denis",True,47,False,,1,True,False,True +,female,"O'Driscoll, Miss. Bridget",False,48,True,,0,False,True,False +,male,"Samaan, Mr. Youssef",True,49,False,,2,True,False,True +18.0,female,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",True,50,True,,1,True,False,False +7.0,male,"Panula, Master. Juha Niilo",False,51,False,,4,True,False,True +21.0,male,"Nosworthy, Mr. Richard Cater",True,52,False,,0,False,False,True +49.0,female,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",True,53,True,D33,1,True,True,False +29.0,female,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",True,54,True,,1,True,True,False +65.0,male,"Ostby, Mr. Engelhart Cornelius",True,55,False,B30,0,False,False,True +,male,"Woolner, Mr. Hugh",True,56,False,C52,0,False,True,True +21.0,female,"Rugg, Miss. Emily",False,57,True,,0,False,True,False +28.5,male,"Novel, Mr. Mansouer",True,58,False,,0,False,False,True +5.0,female,"West, Miss. Constance Mirium",False,59,True,,1,True,True,False +11.0,male,"Goodwin, Master. William Frederick",False,60,False,,5,True,False,True diff --git a/test_data/titanic-validate.csv b/test_data/titanic-validate.csv index eea2faf..ebcb646 100644 --- a/test_data/titanic-validate.csv +++ b/test_data/titanic-validate.csv @@ -1,21 +1,21 @@ -is_mr,passenger_id,age,cabin,is_female,sibsp,is_male,survived,has_siblings,sex,name -True,81,22.0,,False,0,True,False,False,male,"Waelens, Mr. Achille" -True,82,29.0,,False,0,True,True,False,male,"Sheerlinck, Mr. Jan Baptist" -False,83,,,True,0,False,True,False,female,"McDermott, Miss. Brigdet Delia" -True,84,28.0,,False,0,True,False,False,male,"Carrau, Mr. Francisco M" -False,85,17.0,,True,0,False,True,False,female,"Ilett, Miss. Bertha" -True,86,33.0,,True,3,False,True,True,female,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)" -True,87,16.0,,False,1,True,False,True,male,"Ford, Mr. William Neal" -True,88,,,False,0,True,False,False,male,"Slocovski, Mr. Selman Francis" -False,89,23.0,C23 C25 C27,True,3,False,True,True,female,"Fortune, Miss. Mabel Helen" -True,90,24.0,,False,0,True,False,False,male,"Celotti, Mr. Francesco" -True,91,29.0,,False,0,True,False,False,male,"Christmann, Mr. Emil" -True,92,20.0,,False,0,True,False,False,male,"Andreasson, Mr. Paul Edvin" -True,93,46.0,E31,False,1,True,False,True,male,"Chaffee, Mr. Herbert Fuller" -True,94,26.0,,False,1,True,False,True,male,"Dean, Mr. Bertram Frank" -True,95,59.0,,False,0,True,False,False,male,"Coxon, Mr. Daniel" -True,96,,,False,0,True,False,False,male,"Shorney, Mr. Charles Joseph" -True,97,71.0,A5,False,0,True,False,False,male,"Goldschmidt, Mr. George B" -True,98,23.0,D10 D12,False,0,True,True,False,male,"Greenfield, Mr. William Bertram" -True,99,34.0,,True,0,False,True,False,female,"Doling, Mrs. John T (Ada Julia Bone)" -True,100,34.0,,False,1,True,False,True,male,"Kantor, Mr. Sinai" +age,sex,name,is_mr,passenger_id,is_female,cabin,sibsp,has_siblings,survived,is_male +22.0,male,"Waelens, Mr. Achille",True,81,False,,0,False,False,True +29.0,male,"Sheerlinck, Mr. Jan Baptist",True,82,False,,0,False,True,True +,female,"McDermott, Miss. Brigdet Delia",False,83,True,,0,False,True,False +28.0,male,"Carrau, Mr. Francisco M",True,84,False,,0,False,False,True +17.0,female,"Ilett, Miss. Bertha",False,85,True,,0,False,True,False +33.0,female,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",True,86,True,,3,True,True,False +16.0,male,"Ford, Mr. William Neal",True,87,False,,1,True,False,True +,male,"Slocovski, Mr. Selman Francis",True,88,False,,0,False,False,True +23.0,female,"Fortune, Miss. Mabel Helen",False,89,True,C23 C25 C27,3,True,True,False +24.0,male,"Celotti, Mr. Francesco",True,90,False,,0,False,False,True +29.0,male,"Christmann, Mr. Emil",True,91,False,,0,False,False,True +20.0,male,"Andreasson, Mr. Paul Edvin",True,92,False,,0,False,False,True +46.0,male,"Chaffee, Mr. Herbert Fuller",True,93,False,E31,1,True,False,True +26.0,male,"Dean, Mr. Bertram Frank",True,94,False,,1,True,False,True +59.0,male,"Coxon, Mr. Daniel",True,95,False,,0,False,False,True +,male,"Shorney, Mr. Charles Joseph",True,96,False,,0,False,False,True +71.0,male,"Goldschmidt, Mr. George B",True,97,False,A5,0,False,False,True +23.0,male,"Greenfield, Mr. William Bertram",True,98,False,D10 D12,0,False,True,True +34.0,female,"Doling, Mrs. John T (Ada Julia Bone)",True,99,True,,0,False,True,False +34.0,male,"Kantor, Mr. Sinai",True,100,False,,1,True,False,True