From 43a710ff08da2b8b6b6827c9a2c82b6c1a013585 Mon Sep 17 00:00:00 2001
From: project-defiant
Date: Tue, 3 Dec 2024 21:47:27 +0100
Subject: [PATCH 1/3] refactor: derive the _schema automatically

---
 src/gentropy/dataset/dataset.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py
index 67fe05eaf..2251a9a99 100644
--- a/src/gentropy/dataset/dataset.py
+++ b/src/gentropy/dataset/dataset.py
@@ -32,7 +32,6 @@ class Dataset(ABC):
     """

     _df: DataFrame
-    _schema: StructType

     def __post_init__(self: Dataset) -> None:
         """Post init."""
@@ -64,7 +63,7 @@ def schema(self: Dataset) -> StructType:
         Returns:
             StructType: Dataframe expected schema
         """
-        return self._schema
+        return self.get_schema()

     @classmethod
     def _process_class_params(
@@ -172,7 +171,7 @@ def validate_schema(self: Dataset) -> None:
         Raises:
             SchemaValidationError: If the DataFrame schema does not match the expected schema
         """
-        expected_schema = self._schema
+        expected_schema = self.schema
         observed_schema = self._df.schema

         # Unexpected fields in dataset

From c9e29013149646884018f390641211550ae9381d Mon Sep 17 00:00:00 2001
From: project-defiant
Date: Tue, 17 Dec 2024 13:18:55 +0100
Subject: [PATCH 2/3] feat: allow for skipping _schema if get_schema is defined

---
 src/gentropy/dataset/dataset.py        | 34 +++++++++++++++++++-----
 src/gentropy/dataset/pairwise_ld.py    |  2 +-
 tests/gentropy/dataset/test_dataset.py | 36 ++++++++++++++++++--------
 3 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py
index 2251a9a99..7f85cd328 100644
--- a/src/gentropy/dataset/dataset.py
+++ b/src/gentropy/dataset/dataset.py
@@ -8,7 +8,9 @@
 from functools import reduce
 from typing import TYPE_CHECKING, Any

-import pyspark.sql.functions as f
+from pyspark.sql import DataFrame
+from pyspark.sql import functions as f
+from pyspark.sql import types as t
 from pyspark.sql.types import DoubleType
 from pyspark.sql.window import Window
 from typing_extensions import Self
@@ -18,7 +20,7 @@
 if TYPE_CHECKING:
     from enum import Enum

-    from pyspark.sql import Column, DataFrame
+    from pyspark.sql import Column
     from pyspark.sql.types import StructType

     from gentropy.common.session import Session
@@ -26,16 +28,34 @@
 @dataclass
 class Dataset(ABC):
-    """Open Targets Gentropy Dataset.
+    """Open Targets Gentropy Dataset Interface.

-    `Dataset` is a wrapper around a Spark DataFrame with a predefined schema. Schemas for each child dataset are described in the `schemas` module.
+    The `Dataset` interface is a wrapper around a Spark DataFrame with a predefined schema.
+    The class allows overriding the schema with the `_schema` parameter.
+    If `_schema` is not provided, the schema is taken from the dataset-specific
+    `get_schema` method, which must be implemented by each child class.
     """

     _df: DataFrame
+    _schema: StructType | None = None

     def __post_init__(self: Dataset) -> None:
-        """Post init."""
-        self.validate_schema()
+        """Post init.
+
+        Raises:
+            TypeError: If the type of the _df or _schema is not valid
+        """
+        match self._df:
+            case DataFrame():
+                pass
+            case _:
+                raise TypeError(f"Invalid type for _df: {type(self._df)}")
+
+        match self._schema:
+            case None | t.StructType():
+                self.validate_schema()
+            case _:
+                raise TypeError(f"Invalid type for _schema: {type(self._schema)}")

     @property
     def df(self: Dataset) -> DataFrame:
@@ -63,7 +83,7 @@ def schema(self: Dataset) -> StructType:
         Returns:
             StructType: Dataframe expected schema
         """
-        return self.get_schema()
+        return self._schema or self.get_schema()

     @classmethod
     def _process_class_params(
diff --git a/src/gentropy/dataset/pairwise_ld.py b/src/gentropy/dataset/pairwise_ld.py
index b64592094..ab68a74ab 100644
--- a/src/gentropy/dataset/pairwise_ld.py
+++ b/src/gentropy/dataset/pairwise_ld.py
@@ -38,7 +38,7 @@ def __post_init__(self: PairwiseLD) -> None:
         ), f"The number of rows in a pairwise LD table has to be square. Got: {row_count}"

         self.dimension = (int(sqrt(row_count)), int(sqrt(row_count)))
-        self.validate_schema()
+        super().__post_init__()

     @classmethod
     def get_schema(cls: type[PairwiseLD]) -> StructType:
diff --git a/tests/gentropy/dataset/test_dataset.py b/tests/gentropy/dataset/test_dataset.py
index 7c61f3f52..6f1fa0051 100644
--- a/tests/gentropy/dataset/test_dataset.py
+++ b/tests/gentropy/dataset/test_dataset.py
@@ -21,32 +21,44 @@ def get_schema(cls) -> StructType:
         return StructType([StructField("value", IntegerType(), False)])


-class TestCoalesceAndRepartition:
+class TestDataset:
     """Test TestDataset.coalesce and TestDataset.repartition."""

-    def test_repartition(self: TestCoalesceAndRepartition) -> None:
+    def test_repartition(self: TestDataset) -> None:
         """Test Dataset.repartition."""
         initial_partitions = self.test_dataset._df.rdd.getNumPartitions()
         new_partitions = initial_partitions + 1
         self.test_dataset.repartition(new_partitions)
         assert self.test_dataset._df.rdd.getNumPartitions() == new_partitions

-    def test_coalesce(self: TestCoalesceAndRepartition) -> None:
+    def test_coalesce(self: TestDataset) -> None:
         """Test Dataset.coalesce."""
         initial_partitions = self.test_dataset._df.rdd.getNumPartitions()
         new_partitions = initial_partitions - 1 if initial_partitions > 1 else 1
         self.test_dataset.coalesce(new_partitions)
         assert self.test_dataset._df.rdd.getNumPartitions() == new_partitions

+    def test_initialize_without_schema(self: TestDataset, spark: SparkSession) -> None:
+        """Test that a Dataset-derived class falls back to get_schema when _schema is not provided."""
+        df = spark.createDataFrame([(1,)], schema=MockDataset.get_schema())
+        ds = MockDataset(_df=df)
+        assert (
+            ds.schema == MockDataset.get_schema()
+        ), "Schema should be taken from get_schema()"
+
+    def test_passing_incorrect_types(self: TestDataset, spark: SparkSession) -> None:
+        """Test that passing incorrect object types to Dataset raises an error."""
+        with pytest.raises(TypeError):
+            MockDataset(_df="not a dataframe")
+        with pytest.raises(TypeError):
+            MockDataset(_df=self.df, _schema="not a schema")
+
     @pytest.fixture(autouse=True)
-    def _setup(self: TestCoalesceAndRepartition, spark: SparkSession) -> None:
+    def _setup(self: TestDataset, spark: SparkSession) -> None:
         """Setup fixture."""
-        self.test_dataset = MockDataset(
-            _df=spark.createDataFrame(
-                [(1,), (2,), (3,)], schema=MockDataset.get_schema()
-            ),
-            _schema=MockDataset.get_schema(),
-        )
+        df = spark.createDataFrame([(1,), (2,), (3,)], schema=MockDataset.get_schema())
+        self.df = df
+        self.test_dataset = MockDataset(_df=df, 
_schema=MockDataset.get_schema()) def test_dataset_filter(mock_study_index: StudyIndex) -> None: @@ -68,6 +80,8 @@ def test_dataset_drop_infinity_values() -> None: rows = [(v,) for v in data] schema = StructType([StructField("field", DoubleType())]) input_df = spark.createDataFrame(rows, schema=schema) + input_df.printSchema() + assert input_df.count() == 7 # run without specifying *cols results in no filtering ds = MockDataset(_df=input_df, _schema=schema) @@ -76,7 +90,7 @@ def test_dataset_drop_infinity_values() -> None: assert ds.drop_infinity_values("field").df.count() == 1 -def test__process_class_params(spark: SparkSession) -> None: +def test_process_class_params(spark: SparkSession) -> None: """Test splitting of parameters between class and spark parameters.""" params = { "_df": spark.createDataFrame([(1,)], schema=MockDataset.get_schema()), From 897b1f0556ca62979f68bc302ae4021e88dabc7d Mon Sep 17 00:00:00 2001 From: Szymon Szyszkowski Date: Wed, 18 Dec 2024 14:32:32 +0000 Subject: [PATCH 3/3] chore: comments --- src/gentropy/dataset/dataset.py | 3 +-- tests/gentropy/dataset/test_dataset.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gentropy/dataset/dataset.py b/src/gentropy/dataset/dataset.py index 7f85cd328..fa06faec2 100644 --- a/src/gentropy/dataset/dataset.py +++ b/src/gentropy/dataset/dataset.py @@ -11,7 +11,6 @@ from pyspark.sql import DataFrame from pyspark.sql import functions as f from pyspark.sql import types as t -from pyspark.sql.types import DoubleType from pyspark.sql.window import Window from typing_extensions import Self @@ -263,7 +262,7 @@ def drop_infinity_values(self: Self, *cols: str) -> Self: if len(cols) == 0: return self inf_strings = ("Inf", "+Inf", "-Inf", "Infinity", "+Infinity", "-Infinity") - inf_values = [f.lit(v).cast(DoubleType()) for v in inf_strings] + inf_values = [f.lit(v).cast(t.DoubleType()) for v in inf_strings] conditions = [f.col(c).isin(inf_values) for c in cols] # reduce individual filter expressions with or statement # to col("beta").isin([lit(Inf)]) | col("beta").isin([lit(Inf)])... diff --git a/tests/gentropy/dataset/test_dataset.py b/tests/gentropy/dataset/test_dataset.py index 6f1fa0051..96a96ec27 100644 --- a/tests/gentropy/dataset/test_dataset.py +++ b/tests/gentropy/dataset/test_dataset.py @@ -80,7 +80,6 @@ def test_dataset_drop_infinity_values() -> None: rows = [(v,) for v in data] schema = StructType([StructField("field", DoubleType())]) input_df = spark.createDataFrame(rows, schema=schema) - input_df.printSchema() assert input_df.count() == 7 # run without specifying *cols results in no filtering
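
Usage sketch (illustrative, not part of the patch series): with the three patches applied, a concrete Dataset only needs to implement get_schema, and _schema becomes an optional override. The MyDataset class and the local SparkSession below are assumptions made for this example; they mirror the MockDataset helper used in the tests above.

    from pyspark.sql import SparkSession
    from pyspark.sql.types import IntegerType, StructField, StructType

    from gentropy.dataset.dataset import Dataset


    class MyDataset(Dataset):
        """Hypothetical concrete dataset, mirroring the MockDataset test helper."""

        @classmethod
        def get_schema(cls) -> StructType:
            # Single source of truth for validation when _schema is not passed.
            return StructType([StructField("value", IntegerType(), False)])


    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(1,), (2,)], schema=MyDataset.get_schema())

    # _schema can now be omitted; __post_init__ falls back to get_schema() for validation.
    ds = MyDataset(_df=df)
    assert ds.schema == MyDataset.get_schema()

    # Passing a non-DataFrame _df (or a non-StructType _schema) raises TypeError early.
    try:
        MyDataset(_df="not a dataframe")  # type: ignore[arg-type]
    except TypeError as err:
        print(err)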