From 43a710ff08da2b8b6b6827c9a2c82b6c1a013585 Mon Sep 17 00:00:00 2001
From: project-defiant
Date: Tue, 3 Dec 2024 21:47:27 +0100
Subject: [PATCH 1/3] refactor: derive the _schema automatically

---
 src/gentropy/dataset/dataset.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py
index 67fe05eaf..2251a9a99 100644
--- a/src/gentropy/dataset/dataset.py
+++ b/src/gentropy/dataset/dataset.py
@@ -32,7 +32,6 @@ class Dataset(ABC):
     """

     _df: DataFrame
-    _schema: StructType

     def __post_init__(self: Dataset) -> None:
         """Post init."""
@@ -64,7 +63,7 @@ def schema(self: Dataset) -> StructType:
         Returns:
             StructType: Dataframe expected schema
         """
-        return self._schema
+        return self.get_schema()

     @classmethod
     def _process_class_params(
@@ -172,7 +171,7 @@ def validate_schema(self: Dataset) -> None:
         Raises:
             SchemaValidationError: If the DataFrame schema does not match the expected schema
         """
-        expected_schema = self._schema
+        expected_schema = self.schema
         observed_schema = self._df.schema

         # Unexpected fields in dataset

From c9e29013149646884018f390641211550ae9381d Mon Sep 17 00:00:00 2001
From: project-defiant
Date: Tue, 17 Dec 2024 13:18:55 +0100
Subject: [PATCH 2/3] feat: allow for skipping _schema if get_schema is defined

---
 src/gentropy/dataset/dataset.py        | 34 +++++++++++++++++++-----
 src/gentropy/dataset/pairwise_ld.py    |  2 +-
 tests/gentropy/dataset/test_dataset.py | 36 ++++++++++++++++++--------
 3 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py
index 2251a9a99..7f85cd328 100644
--- a/src/gentropy/dataset/dataset.py
+++ b/src/gentropy/dataset/dataset.py
@@ -8,7 +8,9 @@
 from functools import reduce
 from typing import TYPE_CHECKING, Any

-import pyspark.sql.functions as f
+from pyspark.sql import DataFrame
+from pyspark.sql import functions as f
+from pyspark.sql import types as t
 from pyspark.sql.types import DoubleType
 from pyspark.sql.window import Window
 from typing_extensions import Self
@@ -18,7 +20,7 @@
 if TYPE_CHECKING:
     from enum import Enum

-    from pyspark.sql import Column, DataFrame
+    from pyspark.sql import Column
     from pyspark.sql.types import StructType

     from gentropy.common.session import Session
@@ -26,16 +28,34 @@
 @dataclass
 class Dataset(ABC):
-    """Open Targets Gentropy Dataset.
+    """Open Targets Gentropy Dataset Interface.

-    `Dataset` is a wrapper around a Spark DataFrame with a predefined schema. Schemas for each child dataset are described in the `schemas` module.
+    The `Dataset` interface is a wrapper around a Spark DataFrame with a predefined schema.
+    The class allows overriding the schema with the `_schema` parameter.
+    If `_schema` is not provided, the schema is taken from the dataset-specific
+    `get_schema` method, which must be implemented by each child class.
     """

     _df: DataFrame
+    _schema: StructType | None = None

     def __post_init__(self: Dataset) -> None:
-        """Post init."""
-        self.validate_schema()
+        """Post init.
+
+        Raises:
+            TypeError: If the type of the _df or _schema is not valid
+        """
+        match self._df:
+            case DataFrame():
+                pass
+            case _:
+                raise TypeError(f"Invalid type for _df: {type(self._df)}")
+
+        match self._schema:
+            case None | t.StructType():
+                self.validate_schema()
+            case _:
+                raise TypeError(f"Invalid type for _schema: {type(self._schema)}")

     @property
     def df(self: Dataset) -> DataFrame:
@@ -63,7 +83,7 @@ def schema(self: Dataset) -> StructType:
         Returns:
             StructType: Dataframe expected schema
         """
-        return self.get_schema()
+        return self._schema or self.get_schema()

     @classmethod
     def _process_class_params(
diff --git a/src/gentropy/dataset/pairwise_ld.py b/src/gentropy/dataset/pairwise_ld.py
index b64592094..ab68a74ab 100644
--- a/src/gentropy/dataset/pairwise_ld.py
+++ b/src/gentropy/dataset/pairwise_ld.py
@@ -38,7 +38,7 @@ def __post_init__(self: PairwiseLD) -> None:
         ), f"The number of rows in a pairwise LD table has to be square. Got: {row_count}"

         self.dimension = (int(sqrt(row_count)), int(sqrt(row_count)))
-        self.validate_schema()
+        super().__post_init__()

     @classmethod
     def get_schema(cls: type[PairwiseLD]) -> StructType:
diff --git a/tests/gentropy/dataset/test_dataset.py b/tests/gentropy/dataset/test_dataset.py
index 7c61f3f52..6f1fa0051 100644
--- a/tests/gentropy/dataset/test_dataset.py
+++ b/tests/gentropy/dataset/test_dataset.py
@@ -21,32 +21,44 @@ def get_schema(cls) -> StructType:
         return StructType([StructField("value", IntegerType(), False)])


-class TestCoalesceAndRepartition:
+class TestDataset:
     """Test TestDataset.coalesce and TestDataset.repartition."""

-    def test_repartition(self: TestCoalesceAndRepartition) -> None:
+    def test_repartition(self: TestDataset) -> None:
         """Test Dataset.repartition."""
         initial_partitions = self.test_dataset._df.rdd.getNumPartitions()
         new_partitions = initial_partitions + 1
         self.test_dataset.repartition(new_partitions)
         assert self.test_dataset._df.rdd.getNumPartitions() == new_partitions

-    def test_coalesce(self: TestCoalesceAndRepartition) -> None:
+    def test_coalesce(self: TestDataset) -> None:
         """Test Dataset.coalesce."""
         initial_partitions = self.test_dataset._df.rdd.getNumPartitions()
         new_partitions = initial_partitions - 1 if initial_partitions > 1 else 1
         self.test_dataset.coalesce(new_partitions)
         assert self.test_dataset._df.rdd.getNumPartitions() == new_partitions

+    def test_initialize_without_schema(self: TestDataset, spark: SparkSession) -> None:
+        """Test that a Dataset-derived class falls back to get_schema when _schema is not provided."""
+        df = spark.createDataFrame([(1,)], schema=MockDataset.get_schema())
+        ds = MockDataset(_df=df)
+        assert (
+            ds.schema == MockDataset.get_schema()
+        ), "Schema should be taken from get_schema()"
+
+    def test_passing_incorrect_types(self: TestDataset, spark: SparkSession) -> None:
+        """Test that passing incorrect object types to Dataset raises an error."""
+        with pytest.raises(TypeError):
+            MockDataset(_df="not a dataframe")
+        with pytest.raises(TypeError):
+            MockDataset(_df=self.df, _schema="not a schema")
+
     @pytest.fixture(autouse=True)
-    def _setup(self: TestCoalesceAndRepartition, spark: SparkSession) -> None:
+    def _setup(self: TestDataset, spark: SparkSession) -> None:
         """Setup fixture."""
-        self.test_dataset = MockDataset(
-            _df=spark.createDataFrame(
-                [(1,), (2,), (3,)], schema=MockDataset.get_schema()
-            ),
-            _schema=MockDataset.get_schema(),
-        )
+        df = spark.createDataFrame([(1,), (2,), (3,)], schema=MockDataset.get_schema())
+        self.df = df
+        self.test_dataset = MockDataset(_df=df, 
_schema=MockDataset.get_schema()) def test_dataset_filter(mock_study_index: StudyIndex) -> None: @@ -68,6 +80,8 @@ def test_dataset_drop_infinity_values() -> None: rows = [(v,) for v in data] schema = StructType([StructField("field", DoubleType())]) input_df = spark.createDataFrame(rows, schema=schema) + input_df.printSchema() + assert input_df.count() == 7 # run without specifying *cols results in no filtering ds = MockDataset(_df=input_df, _schema=schema) @@ -76,7 +90,7 @@ def test_dataset_drop_infinity_values() -> None: assert ds.drop_infinity_values("field").df.count() == 1 -def test__process_class_params(spark: SparkSession) -> None: +def test_process_class_params(spark: SparkSession) -> None: """Test splitting of parameters between class and spark parameters.""" params = { "_df": spark.createDataFrame([(1,)], schema=MockDataset.get_schema()), From 897b1f0556ca62979f68bc302ae4021e88dabc7d Mon Sep 17 00:00:00 2001 From: Szymon Szyszkowski Date: Wed, 18 Dec 2024 14:32:32 +0000 Subject: [PATCH 3/3] chore: comments --- src/gentropy/dataset/dataset.py | 3 +-- tests/gentropy/dataset/test_dataset.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py index 7f85cd328..fa06faec2 100644 --- a/src/gentropy/dataset/dataset.py +++ b/src/gentropy/dataset/dataset.py @@ -11,7 +11,6 @@ from pyspark.sql import DataFrame from pyspark.sql import functions as f from pyspark.sql import types as t -from pyspark.sql.types import DoubleType from pyspark.sql.window import Window from typing_extensions import Self @@ -263,7 +262,7 @@ def drop_infinity_values(self: Self, *cols: str) -> Self: if len(cols) == 0: return self inf_strings = ("Inf", "+Inf", "-Inf", "Infinity", "+Infinity", "-Infinity") - inf_values = [f.lit(v).cast(DoubleType()) for v in inf_strings] + inf_values = [f.lit(v).cast(t.DoubleType()) for v in inf_strings] conditions = [f.col(c).isin(inf_values) for c in cols] # reduce individual filter expressions with or statement # to col("beta").isin([lit(Inf)]) | col("beta").isin([lit(Inf)])... diff --git a/tests/gentropy/dataset/test_dataset.py b/tests/gentropy/dataset/test_dataset.py index 6f1fa0051..96a96ec27 100644 --- a/tests/gentropy/dataset/test_dataset.py +++ b/tests/gentropy/dataset/test_dataset.py @@ -80,7 +80,6 @@ def test_dataset_drop_infinity_values() -> None: rows = [(v,) for v in data] schema = StructType([StructField("field", DoubleType())]) input_df = spark.createDataFrame(rows, schema=schema) - input_df.printSchema() assert input_df.count() == 7 # run without specifying *cols results in no filtering
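
Usage sketch (illustrative, not part of the patch series): with the three patches applied, a concrete Dataset only needs to implement get_schema, and _schema becomes an optional override. The MyDataset class and the local SparkSession below are assumptions made for this example; they mirror the MockDataset helper used in the tests above.

    from pyspark.sql import SparkSession
    from pyspark.sql.types import IntegerType, StructField, StructType

    from gentropy.dataset.dataset import Dataset


    class MyDataset(Dataset):
        """Hypothetical concrete dataset, mirroring the MockDataset test helper."""

        @classmethod
        def get_schema(cls) -> StructType:
            # Single source of truth for validation when _schema is not passed.
            return StructType([StructField("value", IntegerType(), False)])


    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(1,), (2,)], schema=MyDataset.get_schema())

    # _schema can now be omitted; __post_init__ falls back to get_schema() for validation.
    ds = MyDataset(_df=df)
    assert ds.schema == MyDataset.get_schema()

    # Passing a non-DataFrame _df (or a non-StructType _schema) raises TypeError early.
    try:
        MyDataset(_df="not a dataframe")  # type: ignore[arg-type]
    except TypeError as err:
        print(err)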