Skip to content

Commit

Permalink
Minor bug fixes
Browse files: browse the repository at this point in the history
  • Loading branch information
MatsMoll committed Mar 3, 2024
1 parent 70e5530 commit 0aa5364
Show file tree
Hide file tree
Showing 9 changed files with 127 additions and 119 deletions.
3 changes: 2 additions & 1 deletion aligned/compiler/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from aligned.schemas.derivied_feature import DerivedFeature
from aligned.schemas.feature import Feature, FeatureLocation, FeatureReferance, FeatureType
from aligned.schemas.feature_view import CompiledFeatureView
from aligned.schemas.folder import DatasetStore, JsonDatasetStore
from aligned.schemas.literal_value import LiteralValue
from aligned.schemas.model import Model as ModelSchema
from aligned.schemas.model import FeatureInputVersions as FeatureVersionSchema
Expand All @@ -36,6 +35,7 @@

if TYPE_CHECKING:
from aligned.sources.local import StorageFileReference
from aligned.schemas.folder import DatasetStore

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -199,6 +199,7 @@ def join_asof(self, view: FeatureViewWrapper, on_left: list[str], on_right: list


def resolve_dataset_store(dataset_store: DatasetStore | StorageFileReference) -> DatasetStore:
from aligned.schemas.folder import DatasetStore, JsonDatasetStore

if isinstance(dataset_store, DatasetStore):
return dataset_store
Expand Down
10 changes: 3 additions & 7 deletions aligned/schemas/folder.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from dataclasses import dataclass, field

from mashumaro.types import SerializableType
from aligned.data_source.batch_data_source import BatchDataSource
from aligned.request.retrival_request import RequestResult

from aligned.sources.local import StorageFileSource
from aligned.schemas.codable import Codable

if TYPE_CHECKING:
from aligned.sources.local import StorageFileReference


class DatasetStorageFactory:

supported_stores: dict[str, type[DatasetStore]]
supported_stores: dict[str, type[DatasetStore]] = dict()

_shared: DatasetStorageFactory | None = None

Expand Down Expand Up @@ -130,7 +126,7 @@ async def delete_metadata_for(self, dataset_id: str) -> DatasetMetadata | None:
@dataclass
class JsonDatasetStore(DatasetStore):

source: StorageFileReference
source: StorageFileSource
name = 'json'

async def list_datasets(self) -> GroupedDatasetList:
Expand Down
18 changes: 13 additions & 5 deletions aligned/sources/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@
from aligned.s3.storage import FileStorage, HttpStorage
from aligned.schemas.codable import Codable
from aligned.schemas.feature import EventTimestamp, FeatureType
from aligned.schemas.repo_definition import RepoDefinition
from aligned.storage import Storage
from aligned.feature_store import FeatureStore
from aligned.feature_source import WritableFeatureSource
from aligned.schemas.date_formatter import DateFormatter

if TYPE_CHECKING:
from aligned.compiler.feature_factory import FeatureFactory
from datetime import datetime
from aligned.schemas.repo_definition import RepoDefinition
from aligned.feature_store import FeatureStore


logger = logging.getLogger(__name__)
Expand All @@ -39,6 +39,8 @@ async def as_repo_definition(self) -> RepoDefinition:
raise NotImplementedError()

async def feature_store(self) -> FeatureStore:
from aligned.feature_store import FeatureStore

return FeatureStore.from_definition(await self.as_repo_definition())


Expand All @@ -56,6 +58,8 @@ async def write(self, content: bytes) -> None:
raise NotImplementedError(type(self))

async def as_repo_definition(self) -> RepoDefinition:
from aligned.schemas.repo_definition import RepoDefinition

file = await self.read()
return RepoDefinition.from_json(file)

Expand Down Expand Up @@ -270,7 +274,7 @@ async def write_pandas(self, df: pd.DataFrame) -> None:

async def to_lazy_polars(self) -> pl.LazyFrame:

if not do_file_exist(self.path):
if (not self.path.startswith('http')) and (not do_file_exist(self.path)):
raise UnableToFindFileException(self.path)

try:
Expand Down Expand Up @@ -307,7 +311,11 @@ def multi_source_features_for(
)

async def schema(self) -> dict[str, FeatureFactory]:
parquet_schema = pl.read_parquet_schema(self.path)
if self.path.startswith('http'):
parquet_schema = pl.scan_parquet(self.path).schema
else:
parquet_schema = pl.read_parquet_schema(self.path)

return {
name: FeatureType.from_polars(pl_type).feature_factory for name, pl_type in parquet_schema.items()
}
Expand Down Expand Up @@ -407,7 +415,7 @@ async def upsert(self, job: RetrivalJob, requests: list[RetrivalRequest]) -> Non


@dataclass
class StorageFileSource(StorageFileReference):
class StorageFileSource(StorageFileReference, Codable):

path: str

Expand Down
5 changes: 4 additions & 1 deletion aligned/tests/test_train_test_validate_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,10 @@ async def test_train_test_validate_set_new(titanic_feature_store: FeatureStore)
test = await dataset.test.to_pandas()
validate = await dataset.validate.to_pandas()

datasets = await JsonDatasetStore(dataset_store).list_datasets()
store = JsonDatasetStore(dataset_store)
datasets = await store.list_datasets()

assert store.to_json() != None

assert len(datasets.train_test_validation) == 1
train_dataset = datasets.train_test_validation[0]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "aligned"
version = "0.0.73"
version = "0.0.74"
description = "A data managment and lineage tool for ML applications."
authors = ["Mats E. Mollestad <mats@mollestad.no>"]
license = "Apache-2.0"
Expand Down
2 changes: 1 addition & 1 deletion test_data/titanic-sets.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}, {"name": "in_domain", "values": ["male", "female"]}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}, {"name": "optional"}]}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", 
"location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []}
{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "optional"}, {"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": 
"feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}, {"name": "in_domain", "values": ["male", "female"]}]}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []}
42 changes: 21 additions & 21 deletions test_data/titanic-test.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
has_siblings,name,sex,cabin,sibsp,age,is_female,is_male,is_mr,survived,passenger_id
False,"Sirayanian, Mr. Orsen",male,,0,22.0,False,True,True,False,61
False,"Icard, Miss. Amelie",female,B28,0,38.0,True,False,False,True,62
True,"Harris, Mr. Henry Birkhardt",male,C83,1,45.0,False,True,True,False,63
True,"Skoog, Master. Harald",male,,3,4.0,False,True,False,False,64
False,"Stewart, Mr. Albert A",male,,0,,False,True,True,False,65
True,"Moubarek, Master. Gerios",male,,1,,False,True,False,True,66
False,"Nye, Mrs. (Elizabeth Ramell)",female,F33,0,29.0,True,False,True,True,67
False,"Crease, Mr. Ernest James",male,,0,19.0,False,True,True,False,68
True,"Andersson, Miss. Erna Alexandra",female,,4,17.0,True,False,False,True,69
True,"Kink, Mr. Vincenz",male,,2,26.0,False,True,True,False,70
False,"Jenkin, Mr. Stephen Curnow",male,,0,32.0,False,True,True,False,71
True,"Goodwin, Miss. Lillian Amy",female,,5,16.0,True,False,False,False,72
False,"Hood, Mr. Ambrose Jr",male,,0,21.0,False,True,True,False,73
True,"Chronopoulos, Mr. Apostolos",male,,1,26.0,False,True,True,False,74
False,"Bing, Mr. Lee",male,,0,32.0,False,True,True,True,75
False,"Moen, Mr. Sigurd Hansen",male,F G73,0,25.0,False,True,True,False,76
False,"Staneff, Mr. Ivan",male,,0,,False,True,True,False,77
False,"Moutal, Mr. Rahamin Haim",male,,0,,False,True,True,False,78
False,"Caldwell, Master. Alden Gates",male,,0,0.83,False,True,False,True,79
False,"Dowdell, Miss. Elizabeth",female,,0,30.0,True,False,False,True,80
name,sibsp,is_male,cabin,age,passenger_id,survived,has_siblings,is_female,is_mr,sex
"Sirayanian, Mr. Orsen",0,True,,22.0,61,False,False,False,True,male
"Icard, Miss. Amelie",0,False,B28,38.0,62,True,False,True,False,female
"Harris, Mr. Henry Birkhardt",1,True,C83,45.0,63,False,True,False,True,male
"Skoog, Master. Harald",3,True,,4.0,64,False,True,False,False,male
"Stewart, Mr. Albert A",0,True,,,65,False,False,False,True,male
"Moubarek, Master. Gerios",1,True,,,66,True,True,False,False,male
"Nye, Mrs. (Elizabeth Ramell)",0,False,F33,29.0,67,True,False,True,True,female
"Crease, Mr. Ernest James",0,True,,19.0,68,False,False,False,True,male
"Andersson, Miss. Erna Alexandra",4,False,,17.0,69,True,True,True,False,female
"Kink, Mr. Vincenz",2,True,,26.0,70,False,True,False,True,male
"Jenkin, Mr. Stephen Curnow",0,True,,32.0,71,False,False,False,True,male
"Goodwin, Miss. Lillian Amy",5,False,,16.0,72,False,True,True,False,female
"Hood, Mr. Ambrose Jr",0,True,,21.0,73,False,False,False,True,male
"Chronopoulos, Mr. Apostolos",1,True,,26.0,74,False,True,False,True,male
"Bing, Mr. Lee",0,True,,32.0,75,True,False,False,True,male
"Moen, Mr. Sigurd Hansen",0,True,F G73,25.0,76,False,False,False,True,male
"Staneff, Mr. Ivan",0,True,,,77,False,False,False,True,male
"Moutal, Mr. Rahamin Haim",0,True,,,78,False,False,False,True,male
"Caldwell, Master. Alden Gates",0,True,,0.83,79,True,False,False,False,male
"Dowdell, Miss. Elizabeth",0,False,,30.0,80,True,False,True,False,female
Loading

0 comments on commit 0aa5364

Please sign in to comment.