From a865481dc9f92e4b7b01322e7a0d63ea41fc19ec Mon Sep 17 00:00:00 2001
From: AFThielmann
Date: Mon, 21 Oct 2024 16:49:45 +0200
Subject: [PATCH 1/2] fix val_data mistake in basemodules

---
 mambular/data_utils/datamodule.py          | 598 +++++++++++----------
 mambular/models/sklearn_base_classifier.py |   4 +-
 mambular/models/sklearn_base_lss.py        |   4 +-
 mambular/models/sklearn_base_regressor.py  |   4 +-
 4 files changed, 307 insertions(+), 303 deletions(-)

diff --git a/mambular/data_utils/datamodule.py b/mambular/data_utils/datamodule.py
index 14fc1e32..adb59c76 100644
--- a/mambular/data_utils/datamodule.py
+++ b/mambular/data_utils/datamodule.py
@@ -1,297 +1,301 @@
-import torch
-import pandas as pd
-import numpy as np
-import lightning as pl
-from torch.utils.data import DataLoader
-from sklearn.model_selection import train_test_split
-from .dataset import MambularDataset
-
-
-class MambularDataModule(pl.LightningDataModule):
-    """
-    A PyTorch Lightning data module for managing training and validation data loaders in a structured way.
-
-    This class simplifies the process of batch-wise data loading for training and validation datasets during
-    the training loop, and is particularly useful when working with PyTorch Lightning's training framework.
-
-    Parameters:
-        preprocessor: object
-            An instance of your preprocessor class.
-        batch_size: int
-            Size of batches for the DataLoader.
-        shuffle: bool
-            Whether to shuffle the training data in the DataLoader.
-        X_val: DataFrame or None, optional
-            Validation features. If None, uses train-test split.
-        y_val: array-like or None, optional
-            Validation labels. If None, uses train-test split.
-        val_size: float, optional
-            Proportion of data to include in the validation split if `X_val` and `y_val` are None.
-        random_state: int, optional
-            Random seed for reproducibility in data splitting.
-        regression: bool, optional
-            Whether the problem is regression (True) or classification (False).
-    """
-
-    def __init__(
-        self,
-        preprocessor,
-        batch_size,
-        shuffle,
-        regression,
-        X_val=None,
-        y_val=None,
-        val_size=0.2,
-        random_state=101,
-        **dataloader_kwargs,
-    ):
-        """
-        Initialize the data module with the specified preprocessor, batch size, shuffle option,
-        and optional validation data settings.
-
-        Args:
-            preprocessor (object): An instance of the preprocessor class for data preprocessing.
-            batch_size (int): Size of batches for the DataLoader.
-            shuffle (bool): Whether to shuffle the training data in the DataLoader.
-            X_val (DataFrame or None, optional): Validation features. If None, uses train-test split.
-            y_val (array-like or None, optional): Validation labels. If None, uses train-test split.
-            val_size (float, optional): Proportion of data to include in the validation split if `X_val` and `y_val` are None.
-            random_state (int, optional): Random seed for reproducibility in data splitting.
-            regression (bool, optional): Whether the problem is regression (True) or classification (False).
- """ - super().__init__() - self.preprocessor = preprocessor - self.batch_size = batch_size - self.shuffle = shuffle - self.cat_feature_info = None - self.num_feature_info = None - self.X_val = X_val - self.y_val = y_val - self.val_size = val_size - self.random_state = random_state - self.regression = regression - if self.regression: - self.labels_dtype = torch.float32 - else: - self.labels_dtype = torch.long - - # Initialize placeholders for data - self.X_train = None - self.y_train = None - self.test_preprocessor_fitted = False - self.dataloader_kwargs = dataloader_kwargs - - def preprocess_data( - self, - X_train, - y_train, - X_val=None, - y_val=None, - val_size=0.2, - random_state=101, - ): - """ - Preprocesses the training and validation data. - - Parameters - ---------- - X_train : DataFrame or array-like, shape (n_samples_train, n_features) - Training feature set. - y_train : array-like, shape (n_samples_train,) - Training target values. - X_val : DataFrame or array-like, shape (n_samples_val, n_features), optional - Validation feature set. If None, a validation set will be created from `X_train`. - y_val : array-like, shape (n_samples_val,), optional - Validation target values. If None, a validation set will be created from `y_train`. - val_size : float, optional - Proportion of data to include in the validation split if `X_val` and `y_val` are None. - random_state : int, optional - Random seed for reproducibility in data splitting. - - Returns - ------- - None - """ - - if X_val is None or y_val is None: - self.X_train, self.X_val, self.y_train, self.y_val = train_test_split( - X_train, y_train, test_size=val_size, random_state=random_state - ) - else: - self.X_train = X_train - self.y_train = y_train - self.X_val = X_val - self.y_val = y_val - - # Fit the preprocessor on the combined training and validation data - combined_X = pd.concat([self.X_train, self.X_val], axis=0).reset_index( - drop=True - ) - combined_y = np.concatenate((self.y_train, self.y_val), axis=0) - - # Fit the preprocessor - self.preprocessor.fit(combined_X, combined_y) - - # Update feature info based on the actual processed data - ( - self.cat_feature_info, - self.num_feature_info, - ) = self.preprocessor.get_feature_info() - - def setup(self, stage: str): - """ - Transform the data and create DataLoaders. 
- """ - if stage == "fit": - train_preprocessed_data = self.preprocessor.transform(self.X_train) - val_preprocessed_data = self.preprocessor.transform(self.X_val) - - # Initialize lists for tensors - train_cat_tensors = [] - train_num_tensors = [] - val_cat_tensors = [] - val_num_tensors = [] - - # Populate tensors for categorical features, if present in processed data - for key in self.cat_feature_info: - cat_key = ( - "cat_" + key - ) # Assuming categorical keys are prefixed with 'cat_' - if cat_key in train_preprocessed_data: - train_cat_tensors.append( - torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) - ) - if cat_key in val_preprocessed_data: - val_cat_tensors.append( - torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) - ) - - binned_key = "num_" + key # for binned features - if binned_key in train_preprocessed_data: - train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long - ) - ) - - if binned_key in val_preprocessed_data: - val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long - ) - ) - - # Populate tensors for numerical features, if present in processed data - for key in self.num_feature_info: - num_key = ( - "num_" + key - ) # Assuming numerical keys are prefixed with 'num_' - if num_key in train_preprocessed_data: - train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float32 - ) - ) - if num_key in val_preprocessed_data: - val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float32 - ) - ) - - train_labels = torch.tensor( - self.y_train, dtype=self.labels_dtype - ).unsqueeze(dim=1) - val_labels = torch.tensor(self.y_val, dtype=self.labels_dtype).unsqueeze( - dim=1 - ) - - # Create datasets - self.train_dataset = MambularDataset( - train_cat_tensors, - train_num_tensors, - train_labels, - regression=self.regression, - ) - self.val_dataset = MambularDataset( - val_cat_tensors, val_num_tensors, val_labels, regression=self.regression - ) - elif stage == "test": - if not self.test_preprocessor_fitted: - raise ValueError( - "The preprocessor has not been fitted. Please fit the preprocessor before transforming the test data." 
-                )
-
-            self.test_dataset = MambularDataset(
-                self.test_cat_tensors,
-                self.test_num_tensors,
-                train_labels,
-                regression=self.regression,
-            )
-
-    def preprocess_test_data(self, X):
-        self.test_cat_tensors = []
-        self.test_num_tensors = []
-        test_preprocessed_data = self.preprocessor.transform(X)
-
-        # Populate tensors for categorical features, if present in processed data
-        for key in self.cat_feature_info:
-            cat_key = "cat_" + key  # Assuming categorical keys are prefixed with 'cat_'
-            if cat_key in test_preprocessed_data:
-                self.test_cat_tensors.append(
-                    torch.tensor(test_preprocessed_data[cat_key], dtype=torch.long)
-                )
-
-            binned_key = "num_" + key  # for binned features
-            if binned_key in test_preprocessed_data:
-                self.test_cat_tensors.append(
-                    torch.tensor(test_preprocessed_data[binned_key], dtype=torch.long)
-                )
-
-        # Populate tensors for numerical features, if present in processed data
-        for key in self.num_feature_info:
-            num_key = "num_" + key  # Assuming numerical keys are prefixed with 'num_'
-            if num_key in test_preprocessed_data:
-                self.test_num_tensors.append(
-                    torch.tensor(test_preprocessed_data[num_key], dtype=torch.float32)
-                )
-
-        self.test_preprocessor_fitted = True
-        return self.test_cat_tensors, self.test_num_tensors
-
-    def train_dataloader(self):
-        """
-        Returns the training dataloader.
-
-        Returns:
-            DataLoader: DataLoader instance for the training dataset.
-        """
-
-        return DataLoader(
-            self.train_dataset,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle,
-            **self.dataloader_kwargs,
-        )
-
-    def val_dataloader(self):
-        """
-        Returns the validation dataloader.
-
-        Returns:
-            DataLoader: DataLoader instance for the validation dataset.
-        """
-        return DataLoader(
-            self.val_dataset, batch_size=self.batch_size, **self.dataloader_kwargs
-        )
-
-    def test_dataloader(self):
-        """
-        Returns the test dataloader.
-
-        Returns:
-            DataLoader: DataLoader instance for the test dataset.
-        """
-        return DataLoader(
-            self.test_dataset, batch_size=self.batch_size, **self.dataloader_kwargs
-        )
+import torch
+import pandas as pd
+import numpy as np
+import lightning as pl
+from torch.utils.data import DataLoader
+from sklearn.model_selection import train_test_split
+from .dataset import MambularDataset
+
+
+class MambularDataModule(pl.LightningDataModule):
+    """
+    A PyTorch Lightning data module for managing training and validation data loaders in a structured way.
+
+    This class simplifies the process of batch-wise data loading for training and validation datasets during
+    the training loop, and is particularly useful when working with PyTorch Lightning's training framework.
+
+    Parameters:
+        preprocessor: object
+            An instance of your preprocessor class.
+        batch_size: int
+            Size of batches for the DataLoader.
+        shuffle: bool
+            Whether to shuffle the training data in the DataLoader.
+        X_val: DataFrame or None, optional
+            Validation features. If None, uses train-test split.
+        y_val: array-like or None, optional
+            Validation labels. If None, uses train-test split.
+        val_size: float, optional
+            Proportion of data to include in the validation split if `X_val` and `y_val` are None.
+        random_state: int, optional
+            Random seed for reproducibility in data splitting.
+        regression: bool, optional
+            Whether the problem is regression (True) or classification (False).
+ """ + + def __init__( + self, + preprocessor, + batch_size, + shuffle, + regression, + X_val=None, + y_val=None, + val_size=0.2, + random_state=101, + **dataloader_kwargs, + ): + """ + Initialize the data module with the specified preprocessor, batch size, shuffle option, + and optional validation data settings. + + Args: + preprocessor (object): An instance of the preprocessor class for data preprocessing. + batch_size (int): Size of batches for the DataLoader. + shuffle (bool): Whether to shuffle the training data in the DataLoader. + X_val (DataFrame or None, optional): Validation features. If None, uses train-test split. + y_val (array-like or None, optional): Validation labels. If None, uses train-test split. + val_size (float, optional): Proportion of data to include in the validation split if `X_val` and `y_val` are None. + random_state (int, optional): Random seed for reproducibility in data splitting. + regression (bool, optional): Whether the problem is regression (True) or classification (False). + """ + super().__init__() + self.preprocessor = preprocessor + self.batch_size = batch_size + self.shuffle = shuffle + self.cat_feature_info = None + self.num_feature_info = None + self.X_val = X_val + self.y_val = y_val + self.val_size = val_size + self.random_state = random_state + self.regression = regression + if self.regression: + self.labels_dtype = torch.float32 + else: + self.labels_dtype = torch.long + + # Initialize placeholders for data + self.X_train = None + self.y_train = None + self.test_preprocessor_fitted = False + self.dataloader_kwargs = dataloader_kwargs + + def preprocess_data( + self, + X_train, + y_train, + X_val=None, + y_val=None, + val_size=0.2, + random_state=101, + ): + """ + Preprocesses the training and validation data. + + Parameters + ---------- + X_train : DataFrame or array-like, shape (n_samples_train, n_features) + Training feature set. + y_train : array-like, shape (n_samples_train,) + Training target values. + X_val : DataFrame or array-like, shape (n_samples_val, n_features), optional + Validation feature set. If None, a validation set will be created from `X_train`. + y_val : array-like, shape (n_samples_val,), optional + Validation target values. If None, a validation set will be created from `y_train`. + val_size : float, optional + Proportion of data to include in the validation split if `X_val` and `y_val` are None. + random_state : int, optional + Random seed for reproducibility in data splitting. + + Returns + ------- + None + """ + + if X_val is None or y_val is None: + self.X_train, self.X_val, self.y_train, self.y_val = train_test_split( + X_train, y_train, test_size=val_size, random_state=random_state + ) + else: + self.X_train = X_train + self.y_train = y_train + self.X_val = X_val + self.y_val = y_val + + # Fit the preprocessor on the combined training and validation data + combined_X = pd.concat([self.X_train, self.X_val], axis=0).reset_index( + drop=True + ) + combined_y = np.concatenate((self.y_train, self.y_val), axis=0) + + # Fit the preprocessor + self.preprocessor.fit(combined_X, combined_y) + + # Update feature info based on the actual processed data + ( + self.cat_feature_info, + self.num_feature_info, + ) = self.preprocessor.get_feature_info() + + def setup(self, stage: str): + """ + Transform the data and create DataLoaders. 
+ """ + if stage == "fit": + train_preprocessed_data = self.preprocessor.transform(self.X_train) + val_preprocessed_data = self.preprocessor.transform(self.X_val) + + # Initialize lists for tensors + train_cat_tensors = [] + train_num_tensors = [] + val_cat_tensors = [] + val_num_tensors = [] + + # Populate tensors for categorical features, if present in processed data + for key in self.cat_feature_info: + cat_key = "cat_" + str( + key + ) # Assuming categorical keys are prefixed with 'cat_' + if cat_key in train_preprocessed_data: + train_cat_tensors.append( + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) + ) + if cat_key in val_preprocessed_data: + val_cat_tensors.append( + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) + ) + + binned_key = "num_" + str(key) # for binned features + if binned_key in train_preprocessed_data: + train_cat_tensors.append( + torch.tensor( + train_preprocessed_data[binned_key], dtype=torch.long + ) + ) + + if binned_key in val_preprocessed_data: + val_cat_tensors.append( + torch.tensor( + val_preprocessed_data[binned_key], dtype=torch.long + ) + ) + + # Populate tensors for numerical features, if present in processed data + for key in self.num_feature_info: + num_key = "num_" + str( + key + ) # Assuming numerical keys are prefixed with 'num_' + if num_key in train_preprocessed_data: + train_num_tensors.append( + torch.tensor( + train_preprocessed_data[num_key], dtype=torch.float32 + ) + ) + if num_key in val_preprocessed_data: + val_num_tensors.append( + torch.tensor( + val_preprocessed_data[num_key], dtype=torch.float32 + ) + ) + + train_labels = torch.tensor( + self.y_train, dtype=self.labels_dtype + ).unsqueeze(dim=1) + val_labels = torch.tensor(self.y_val, dtype=self.labels_dtype).unsqueeze( + dim=1 + ) + + # Create datasets + self.train_dataset = MambularDataset( + train_cat_tensors, + train_num_tensors, + train_labels, + regression=self.regression, + ) + self.val_dataset = MambularDataset( + val_cat_tensors, val_num_tensors, val_labels, regression=self.regression + ) + elif stage == "test": + if not self.test_preprocessor_fitted: + raise ValueError( + "The preprocessor has not been fitted. Please fit the preprocessor before transforming the test data." 
+                )
+
+            self.test_dataset = MambularDataset(
+                self.test_cat_tensors,
+                self.test_num_tensors,
+                train_labels,
+                regression=self.regression,
+            )
+
+    def preprocess_test_data(self, X):
+        self.test_cat_tensors = []
+        self.test_num_tensors = []
+        test_preprocessed_data = self.preprocessor.transform(X)
+
+        # Populate tensors for categorical features, if present in processed data
+        for key in self.cat_feature_info:
+            cat_key = "cat_" + str(
+                key
+            )  # Assuming categorical keys are prefixed with 'cat_'
+            if cat_key in test_preprocessed_data:
+                self.test_cat_tensors.append(
+                    torch.tensor(test_preprocessed_data[cat_key], dtype=torch.long)
+                )
+
+            binned_key = "num_" + str(key)  # for binned features
+            if binned_key in test_preprocessed_data:
+                self.test_cat_tensors.append(
+                    torch.tensor(test_preprocessed_data[binned_key], dtype=torch.long)
+                )
+
+        # Populate tensors for numerical features, if present in processed data
+        for key in self.num_feature_info:
+            num_key = "num_" + str(
+                key
+            )  # Assuming numerical keys are prefixed with 'num_'
+            if num_key in test_preprocessed_data:
+                self.test_num_tensors.append(
+                    torch.tensor(test_preprocessed_data[num_key], dtype=torch.float32)
+                )
+
+        self.test_preprocessor_fitted = True
+        return self.test_cat_tensors, self.test_num_tensors
+
+    def train_dataloader(self):
+        """
+        Returns the training dataloader.
+
+        Returns:
+            DataLoader: DataLoader instance for the training dataset.
+        """
+
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=self.shuffle,
+            **self.dataloader_kwargs,
+        )
+
+    def val_dataloader(self):
+        """
+        Returns the validation dataloader.
+
+        Returns:
+            DataLoader: DataLoader instance for the validation dataset.
+        """
+        return DataLoader(
+            self.val_dataset, batch_size=self.batch_size, **self.dataloader_kwargs
+        )
+
+    def test_dataloader(self):
+        """
+        Returns the test dataloader.
+
+        Returns:
+            DataLoader: DataLoader instance for the test dataset.
+ """ + return DataLoader( + self.test_dataset, batch_size=self.batch_size, **self.dataloader_kwargs + ) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index 0c7e30f9..1e903cba 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -170,7 +170,7 @@ def build_model( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): @@ -321,7 +321,7 @@ def fit( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index 15943834..40d229cc 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -189,7 +189,7 @@ def build_model( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): @@ -361,7 +361,7 @@ def fit( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index 1a098ac5..dbbf6040 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -168,7 +168,7 @@ def build_model( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): @@ -314,7 +314,7 @@ def fit( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): From 628b0a0664343ac738592f7f9229684a1cbbba2b Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 16:51:56 +0200 Subject: [PATCH 2/2] version increase --- mambular/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/__version__.py b/mambular/__version__.py index 095e93c3..75a44a90 100644 --- a/mambular/__version__.py +++ b/mambular/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.2.3" +__version__ = "0.2.4"