From 1ffcccb21f6b5a52334479e2e311af2532490211 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Fri, 21 Nov 2025 00:04:06 +0100 Subject: [PATCH 01/42] Added function to convert shift to dict that specifies the shift for each input individually --- physXAI/preprocessing/preprocessing.py | 75 ++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 4754ecc..53faf39 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -12,6 +12,81 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' +def convert_shift_to_dict(s: Union[int, str, dict], inputs: list[str]) -> dict: + """ + Convert a given shift variable into a dictionary in which a shift is defined for every input + + Args: + s (Union[int, str, dict]): Shift value. Either a single string or int which then will be applied to all the inputs or + a dictionary in which a different shift can be defined for each input. If the dictionary does not specify the + shift for all inputs, the shift for inputs not specified is set to 'previous' as default + inputs (list(str)): List of Input variables + """ + + def return_valid_shift(val: Union[int, str]): + """ check the validity of the given shift and return a string if val is int """ + if val in ['current', 0]: + val = 'current' + elif val in ['previous', 1]: + val = 'previous' + elif val == 'mean_over_interval': + val = 'mean_over_interval' + else: + raise ValueError( + f"Value of shift not supported, value is: {val}. 
Shift must be 'current' (or 0 if s is int), " + f"'previous' (or 1 if s is int) or 'mean_over_interval'.") + return val + + if isinstance(s, Union[int, str]): + d = {} + s = return_valid_shift(s) + + # add shift for each input + for inp in inputs: + d.update({inp: s}) + return d + + elif isinstance(s, dict): + def get_lag(inputs: list[str], current_input: str) -> int: + """ get lag of current input """ + count = 0 + for inp in inputs: + spl = inp.split(current_input) # make sure it is the current input + if spl[0] == '' and spl[1] != '' and spl[1].split('_lag')[0] == '': + count += 1 + return count + + # check if lags exist + d = {} + inputs_without_lags = {} + for inp in inputs: + # skip if current input is just the lag of another inp + if not inp.__contains__('_lag'): + inputs_without_lags.update({inp: get_lag(inputs, inp)}) + + for inp in inputs_without_lags.keys(): + # if an input has a shift assigned already, the validity is checked + # otherwise 'previous' is assigned (default value) + if inp in s.keys(): + d.update({inp: return_valid_shift(s[inp])}) + else: + d.update({inp: 'previous'}) + + # all inputs with lags should have the same shift + if inputs_without_lags[inp] > 0: # if current input has lags + for i in range(inputs_without_lags[inp]): + name = inp + '_lag' + str(i+1) + + # if a shift was already defined for this lag, check if it matches the shift of the original inp + if name in s.keys(): + assert return_valid_shift(s[name]) == d[inp], \ + 'Make sure that all lags of an input have the same shift' + d.update({name: d[inp]}) + return d + else: + raise TypeError(f'shift must be of type int, str or dict, is type {type(s)}') + + class PreprocessingData(ABC): """ Abstract Preprocessing Class From b8706cc9794a2b21fa7efdfa319255e92b4cbbb6 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Fri, 21 Nov 2025 00:05:02 +0100 Subject: [PATCH 02/42] Added unittests for function preprocessing.convert_shift_to_dict --- unittests/test_coverage.py | 69 
+++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index e17b448..54bea18 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -1,13 +1,14 @@ import json import os import pathlib +import unittest from unittest.mock import patch import keras import pytest ###################################################################################################################### from physXAI.utils.logging import Logger, get_parent_working_directory from physXAI.preprocessing.preprocessing import PreprocessingSingleStep, PreprocessingMultiStep, \ - PreprocessingData + PreprocessingData, convert_shift_to_dict from physXAI.preprocessing.constructed import Feature, FeatureConstruction, FeatureConstant from physXAI.feature_selection.recursive_feature_elimination import recursive_feature_elimination_pipeline from physXAI.models.models import LinearRegressionModel, AbstractModel @@ -93,6 +94,72 @@ def test_preprocessing_multistep(file_path, inputs_tair, output_tair): overlapping_sequences=False, batch_size=1) prep.pipeline(file_path) +class TestPreprocessingShiftConversion(unittest.TestCase): + + inputs = ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', + 'weaSta_reaWeaHDirNor_y', 'oveHeaPumY_u', 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] + + # test case: int given for shift + def test_int(self): + shift = 0 + res = convert_shift_to_dict(shift, self.inputs) + res_expected = {'reaTZon_y': 'current', 'reaTZon_y_lag1': 'current', 'reaTZon_y_lag2': 'current', + 'weaSta_reaWeaTDryBul_y': 'current', 'weaSta_reaWeaTDryBul_y_lag1': 'current', + 'weaSta_reaWeaHDirNor_y': 'current', 'oveHeaPumY_u': 'current', 'oveHeaPumY_u_lag1': 'current', + 'oveHeaPumY_u_lag2': 'current'} + assert res == res_expected + + # test case: unsupported int given for shift + def test_unsupported_int(self): + shift = 2 + with 
self.assertRaises(ValueError): + convert_shift_to_dict(shift, self.inputs) + + # test case: str given for shift + def test_str(self): + shift = 'mean_over_interval' + res = convert_shift_to_dict(shift, self.inputs) + res_expected = {'reaTZon_y': 'mean_over_interval', 'reaTZon_y_lag1': 'mean_over_interval', + 'reaTZon_y_lag2': 'mean_over_interval', 'weaSta_reaWeaTDryBul_y': 'mean_over_interval', + 'weaSta_reaWeaTDryBul_y_lag1': 'mean_over_interval', + 'weaSta_reaWeaHDirNor_y': 'mean_over_interval', 'oveHeaPumY_u': 'mean_over_interval', + 'oveHeaPumY_u_lag1': 'mean_over_interval', 'oveHeaPumY_u_lag2': 'mean_over_interval'} + assert res == res_expected + + # test case: unsupported str given for shift + def test_unsupported_str(self): + shift = 'test' + with self.assertRaises(ValueError): + convert_shift_to_dict(shift, self.inputs) + + # test case: unsupported type given for shift + def test_unsupported_type(self): + shift = ['previous'] + with self.assertRaises(TypeError): + convert_shift_to_dict(shift, self.inputs) + + # test case: autocomplete incomplete dictionary given for shift + def test_autocomplete_incomplete_dict(self): + shift = {'reaTZon_y': 0, 'reaTZon_y_lag1': 0, 'weaSta_reaWeaTDryBul_y': 'mean_over_interval'} + + # previous is default for all inputs that are not specified + res = convert_shift_to_dict(shift, self.inputs) + res_expected = {'reaTZon_y': 'current', 'reaTZon_y_lag1': 'current', 'reaTZon_y_lag2': 'current', + 'weaSta_reaWeaTDryBul_y': 'mean_over_interval', + 'weaSta_reaWeaTDryBul_y_lag1': 'mean_over_interval', + 'weaSta_reaWeaHDirNor_y': 'previous', 'oveHeaPumY_u': 'previous', + 'oveHeaPumY_u_lag1': 'previous', + 'oveHeaPumY_u_lag2': 'previous'} + assert len(res) == len(self.inputs) + assert res == res_expected + + # test case: lags of the same input have mismatching shifts + def test_lag_with_mismatching_shifts(self): + shift = {'reaTZon_y': 0, 'reaTZon_y_lag1': 1, 'weaSta_reaWeaTDryBul_y': 'mean_over_interval'} + with 
self.assertRaises(AssertionError): + convert_shift_to_dict(shift, self.inputs) + + @pytest.fixture(scope='module') def p_hp_data(file_path, inputs_php, output_php): # Setup up logger for saving From 499b4c7b3ccef246a41322a4e01ee6245ba24f2a Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Fri, 21 Nov 2025 00:08:54 +0100 Subject: [PATCH 03/42] Small import improvement --- unittests/test_coverage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index 54bea18..f083a35 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -1,8 +1,8 @@ import json import os import pathlib -import unittest from unittest.mock import patch +from unittest import TestCase import keras import pytest ###################################################################################################################### @@ -94,7 +94,7 @@ def test_preprocessing_multistep(file_path, inputs_tair, output_tair): overlapping_sequences=False, batch_size=1) prep.pipeline(file_path) -class TestPreprocessingShiftConversion(unittest.TestCase): +class TestPreprocessingShiftConversion(TestCase): inputs = ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', 'weaSta_reaWeaHDirNor_y', 'oveHeaPumY_u', 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] From f40439631f5e5637b07efd89c7c4cc0a85229e54 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Fri, 21 Nov 2025 09:18:37 +0100 Subject: [PATCH 04/42] Bug fix for backwards compatibility with python 3.9 --- physXAI/preprocessing/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 53faf39..c1e14b7 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -23,7 +23,7 @@ def convert_shift_to_dict(s: Union[int, str, dict], inputs: list[str]) -> dict: inputs (list(str)): 
List of Input variables """ - def return_valid_shift(val: Union[int, str]): + def return_valid_shift(val: (int, str)): """ check the validity of the given shift and return a string if val is int """ if val in ['current', 0]: val = 'current' From 15c2ed7b82055f953e35ab0dbf49e65b66cd5be7 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Fri, 21 Nov 2025 09:25:37 +0100 Subject: [PATCH 05/42] Corrected bug fix for backwards compatibility with python 3.9 --- physXAI/preprocessing/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index c1e14b7..37c659f 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -23,7 +23,7 @@ def convert_shift_to_dict(s: Union[int, str, dict], inputs: list[str]) -> dict: inputs (list(str)): List of Input variables """ - def return_valid_shift(val: (int, str)): + def return_valid_shift(val: Union[int, str]): """ check the validity of the given shift and return a string if val is int """ if val in ['current', 0]: val = 'current' @@ -37,7 +37,7 @@ def return_valid_shift(val: (int, str)): f"'previous' (or 1 if s is int) or 'mean_over_interval'.") return val - if isinstance(s, Union[int, str]): + if isinstance(s, (int, str)): d = {} s = return_valid_shift(s) From 828c64d0050e9f259afa7a69e9366103164cc975 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Fri, 21 Nov 2025 17:08:45 +0100 Subject: [PATCH 06/42] partly integrated new structure for shifting inputs and outputs --- physXAI/preprocessing/preprocessing.py | 65 ++++++++++++++++++++------ 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index fe6f1d9..f2fb300 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -1,8 +1,9 @@ import os from abc import ABC, abstractmethod -from typing import 
Optional, Union +from typing import Optional, Union, Iterable import numpy as np import pandas as pd +import itertools from sklearn.model_selection import train_test_split from physXAI.preprocessing.constructed import FeatureConstruction from physXAI.preprocessing.training_data import TrainingData, TrainingDataMultiStep, TrainingDataGeneric @@ -14,12 +15,13 @@ def convert_shift_to_dict(s: Union[int, str, dict], inputs: list[str]) -> dict: """ - Convert a given shift variable into a dictionary in which a shift is defined for every input + Convert a given shift variable (int, str) into a dictionary in which a shift is defined for every input. + If a dictionary is given as shift, check entries and autocomplete dict if necessary. Args: s (Union[int, str, dict]): Shift value. Either a single string or int which then will be applied to all the inputs or a dictionary in which a different shift can be defined for each input. If the dictionary does not specify the - shift for all inputs, the shift for inputs not specified is set to 'previous' as default + shift for all inputs, the shift for inputs not specified is set to 'previous' as default (autocomplete) inputs (list(str)): List of Input variables """ @@ -92,7 +94,7 @@ class PreprocessingData(ABC): Abstract Preprocessing Class """ - def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: int = 1, + def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Union[int, str, dict] = 'previous', time_step: Optional[Union[int, float]] = None, test_size: float = 0.1, val_size: float = 0.1, random_state: int = 42, time_index_col: Union[str, float] = 0, csv_delimiter: str = ';', csv_encoding: str = 'latin1', @@ -103,7 +105,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: int Args: inputs (List[str]): List of column names to be used as input features. output (Union[str, List[str]]): Column name(s) for the target variable(s). 
- shift (int): The number of time steps to shift the target variable for forecasting. + shift (int): The number of time steps to shift the target variable for forecasting. # TODO: update docstring A shift of one means predicting the next time step. time_step (Optional[Union[int, float]]): Optional time step sampling. If None, sampling of data is used. test_size (float): Proportion of the dataset to allocate to the test set. @@ -126,7 +128,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: int if isinstance(output, str): output = [output] self.output: list[str] = output - self.shift: int = shift + self.shift: dict = convert_shift_to_dict(shift, inputs) self.time_step = time_step # Training, validation and test size should be equal to 1 @@ -202,7 +204,7 @@ class PreprocessingSingleStep(PreprocessingData): validation, and test sets. """ - def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: int = 1, + def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Union[int, str, dict] = 'previous', time_step: Optional[Union[int, float]] = None, test_size: float = 0.1, val_size: float = 0.1, random_state: int = 42, time_index_col: Union[str, float] = 0, csv_delimiter: str = ';', csv_encoding: str = 'latin1', @@ -213,7 +215,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: int Args: inputs (List[str]): List of column names to be used as input features. output (Union[str, List[str]]): Column name(s) for the target variable(s). - shift (int): The number of time steps to shift the target variable for forecasting. + shift (int): The number of time steps to shift the target variable for forecasting. # TODO: update doc dring A shift of one means predicting the next time step. time_step (Optional[Union[int, float]]): Optional time step sampling. If None, sampling of data is used. test_size (float): Proportion of the dataset to allocate to the test set. 
@@ -257,16 +259,51 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: last_valid_index = non_nan_rows.iloc[::-1].idxmax() if non_nan_rows.any() else None df = df.loc[first_valid_index:last_valid_index] if df.isnull().values.any(): - if self.ignore_nan: + if self.ignore_nan: # TODO: restructure this df.dropna(inplace=True) else: - raise ValueError("Data Error: The TrainingData contains NaN values in intermediate rows. If this is intended, set ignore_nan=True in PreprocessingSingleStep.") + pass # raise ValueError("Data Error: The TrainingData contains NaN values in intermediate rows. If this is intended, set ignore_nan=True in PreprocessingSingleStep.") X = df[self.inputs] - y = df[self.output].shift(-self.shift) - if self.shift > 0: # pragma: no cover - y = y.iloc[:-self.shift] - X = X.iloc[:-self.shift] + y = df[self.output] + + assert len(self.inputs) == len(self.shift.keys()), (f"Something went wrong, number of inputs ({len(self.inputs)})" + f" doesn't match number of inputs defined in shift ({len(self.shift.keys())})") + + if all('current' == self.shift[k] for k in self.shift.keys()): + pass # nothing to do here + elif all('previous' == self.shift[k] for k in self.shift.keys()): + X = X.shift(1) + y = y.iloc[1:] + X = X.iloc[1:] + elif all('mean_over_interval' == self.shift[k] for k in self.shift.keys()): + + # output interval is target grid + y.dropna(inplace=True) + + def pairwise(iterable: Iterable): + "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
+ a, b = itertools.tee(iterable) + next(b, None) + return zip(a, b) + + original_grid = np.array(X.index) + results = [] + for i, j in pairwise(y.index): + slicer = np.logical_and(original_grid >= i, original_grid < j) + d = {'Index': j} + for inp in self.inputs: + d[inp] = X[inp][slicer].mean() + results.append(d) + + # length of X and Y have to be synchronized + y = y.iloc[1:] + X = pd.DataFrame(results).set_index('Index') + + else: # different inputs have different shift + pass + + # y = df[self.output].shift(-self.shift) return X, y From e2986de05f24b2da9213fa114097e1258c77eb51 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 26 Nov 2025 10:02:13 +0100 Subject: [PATCH 07/42] Fixed error occurring with recursive_feature_elimination --- physXAI/preprocessing/preprocessing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index f2fb300..cc6281c 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -265,7 +265,12 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: pass # raise ValueError("Data Error: The TrainingData contains NaN values in intermediate rows. 
If this is intended, set ignore_nan=True in PreprocessingSingleStep.") X = df[self.inputs] - y = df[self.output] + y = df[self.output].copy() + + # check if current inputs match inputs (keys) in shift dictionary and update shift if necessary + # required for recursive feature selection since inputs change after initialization of Preprocessing object + if (len(self.inputs) != len(self.shift.keys())) or not all(inp in self.shift.keys() for inp in self.inputs): + self.shift = convert_shift_to_dict(self.shift, self.inputs) assert len(self.inputs) == len(self.shift.keys()), (f"Something went wrong, number of inputs ({len(self.inputs)})" f" doesn't match number of inputs defined in shift ({len(self.shift.keys())})") From 9973f0cde3b2757386ec94afa8ad26aa09989c1e Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 26 Nov 2025 23:25:06 +0100 Subject: [PATCH 08/42] Implemented new structure and methods for shifting input data --- physXAI/preprocessing/constructed.py | 12 +- physXAI/preprocessing/preprocessing.py | 154 ++++++++++++++++++------- 2 files changed, 121 insertions(+), 45 deletions(-) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index e90b7d3..c481277 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -555,7 +555,7 @@ def get_feature(name: str) -> Union[FeatureBase, None]: return None @staticmethod - def process(df: DataFrame): + def process(df: DataFrame, feature_names: list[str] = None): """ Processes the input DataFrame by applying all registered feature transformations in order. Each feature's `process` method is called, which typically adds a new column to `df` @@ -563,10 +563,16 @@ def process(df: DataFrame): Args: df (DataFrame): The DataFrame to process and add features to. 
+ feature_names (list[str]): optional parameter to only process those features given in feature_names """ - for f in FeatureConstruction.features: - f.process(df) + if not feature_names: + for f in FeatureConstruction.features: + f.process(df) + else: + for f in FeatureConstruction.features: + if f.feature in feature_names: + f.process(df) @staticmethod def get_config() -> list: diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index cc6281c..6e39a7e 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -5,7 +5,7 @@ import pandas as pd import itertools from sklearn.model_selection import train_test_split -from physXAI.preprocessing.constructed import FeatureConstruction +from physXAI.preprocessing.constructed import FeatureConstruction, FeatureLag from physXAI.preprocessing.training_data import TrainingData, TrainingDataMultiStep, TrainingDataGeneric from physXAI.utils.logging import get_full_path os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' @@ -168,11 +168,14 @@ def load_data(self, file_path: str) -> pd.DataFrame: else: assert self.time_step % time_step == 0, (f"Value Error: Given time step {self.time_step} is not a multiple " f"of data time step: {time_step}.") - filtering = (df.index - df.index[0]) % self.time_step == 0 - df = df[filtering] return df + def filter_df_according_to_timestep(self, df: pd.DataFrame): + filtering = (df.index - df.index[0]) % self.time_step == 0 + df = df[filtering] + return df + @abstractmethod def pipeline(self, file_path: str) -> TrainingDataGeneric: """ @@ -248,67 +251,131 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: and target (y) DataFrames. """ - # Applies feature constructions defined in `FeatureConstruction`. 
- FeatureConstruction.process(df) - - df = df[self.inputs + [out for out in self.output if out not in self.inputs]] - - # Nan handling - non_nan_rows = df.notna().all(axis=1) - first_valid_index = non_nan_rows.idxmax() if non_nan_rows.any() else None - last_valid_index = non_nan_rows.iloc[::-1].idxmax() if non_nan_rows.any() else None - df = df.loc[first_valid_index:last_valid_index] - if df.isnull().values.any(): - if self.ignore_nan: # TODO: restructure this - df.dropna(inplace=True) - else: - pass # raise ValueError("Data Error: The TrainingData contains NaN values in intermediate rows. If this is intended, set ignore_nan=True in PreprocessingSingleStep.") - - X = df[self.inputs] - y = df[self.output].copy() - # check if current inputs match inputs (keys) in shift dictionary and update shift if necessary # required for recursive feature selection since inputs change after initialization of Preprocessing object if (len(self.inputs) != len(self.shift.keys())) or not all(inp in self.shift.keys() for inp in self.inputs): self.shift = convert_shift_to_dict(self.shift, self.inputs) - assert len(self.inputs) == len(self.shift.keys()), (f"Something went wrong, number of inputs ({len(self.inputs)})" - f" doesn't match number of inputs defined in shift ({len(self.shift.keys())})") + assert len(self.inputs) == len(self.shift.keys()), ( + f"Something went wrong, number of inputs ({len(self.inputs)})" + f" doesn't match number of inputs defined in shift ({len(self.shift.keys())})") - if all('current' == self.shift[k] for k in self.shift.keys()): - pass # nothing to do here - elif all('previous' == self.shift[k] for k in self.shift.keys()): - X = X.shift(1) - y = y.iloc[1:] - X = X.iloc[1:] - elif all('mean_over_interval' == self.shift[k] for k in self.shift.keys()): + # extract the names of lagged inputs + lagged_inputs = [] + for f in FeatureConstruction.features: + if isinstance(f, FeatureLag): + lagged_inputs.append(f.feature) # name of the feature - # output interval is 
target grid - y.dropna(inplace=True) + inputs_without_lags = [inp for inp in self.inputs if inp not in lagged_inputs] + + # Applies feature constructions defined in `FeatureConstruction`. + # Only apply for those features that are not lags since lags must be constructed after sampling the data + # according to the given time step + FeatureConstruction.process(df, feature_names=inputs_without_lags) + df = df[inputs_without_lags + [out for out in self.output if out not in inputs_without_lags]] + + # Nan handling in first and last rows + non_nan_rows = df.notna().all(axis=1) + first_valid_index = non_nan_rows.idxmax() if non_nan_rows.any() else None + last_valid_index = non_nan_rows.iloc[::-1].idxmax() if non_nan_rows.any() else None + df = df.loc[first_valid_index:last_valid_index] + + def get_mean_over_interval(y: pd.DataFrame, x: pd.DataFrame, inputs: list[str]): def pairwise(iterable: Iterable): "s -> (s0,s1), (s1,s2), (s2, s3), ..." a, b = itertools.tee(iterable) next(b, None) return zip(a, b) - original_grid = np.array(X.index) + original_grid = np.array(x.index) results = [] - for i, j in pairwise(y.index): + for i, j in pairwise(y.index): # output interval is target grid slicer = np.logical_and(original_grid >= i, original_grid < j) d = {'Index': j} - for inp in self.inputs: - d[inp] = X[inp][slicer].mean() + for inp in inputs: + d[inp] = x[inp][slicer].mean() results.append(d) - # length of X and Y have to be synchronized + x = pd.DataFrame(results).set_index('Index') + + return x + + # output is independent of shift -> filter / sample according to time step already + y = df[self.output].copy() + y = self.filter_df_according_to_timestep(y) + + X = df[inputs_without_lags].copy() + + if all('current' == self.shift[k] for k in inputs_without_lags): + # filter / sample data + X = self.filter_df_according_to_timestep(X) + # nothing more to do here + elif all('previous' == self.shift[k] for k in inputs_without_lags): + # filter / sample data + X = 
self.filter_df_according_to_timestep(X) + + # shift data by 1 and shorten DataFrames accordingly + X = X.shift(1) + y = y.iloc[1:] + X = X.iloc[1:] + elif all('mean_over_interval' == self.shift[k] for k in inputs_without_lags): + X = get_mean_over_interval(y, X, inputs_without_lags) + # synchronize length between X and y y = y.iloc[1:] - X = pd.DataFrame(results).set_index('Index') - else: # different inputs have different shift - pass + else: # different inputs have different shifts + res = [] + for inp in inputs_without_lags: + # only process inputs with shift method mean_over_interval first since X cannot be filtered / sampled + # to the actual required time steps until the intermediate values were taken into the mean + if self.shift[inp] == 'mean_over_interval': + res.append(get_mean_over_interval(y, X[[inp]], [inp])) + + # filter / sample X according to required time step + X = self.filter_df_according_to_timestep(X) + # process inputs with shift methods 'current' and 'previous' + for inp in inputs_without_lags: + _x = X[[inp]] + if self.shift[inp] == 'current': + # no transformation needed + res.append(_x) + elif self.shift[inp] == 'previous': + # shift by 1 + _x = _x.shift(1) + _x = _x.iloc[1:] + res.append(_x) + elif self.shift[inp] == 'mean_over_interval': + continue + else: + raise NotImplementedError(f"Shift method '{self.shift[inp]}' not implemented.") + + X = pd.concat(res, axis=1) + + # Shift methods 'previous' and 'mean_over_interval' reduce available data points by 1. + # Therefore, length of X and y have to be synchronized + if 'previous' in self.shift.values() or 'mean_over_interval' in self.shift.values(): + y = y.iloc[1:] + X = X.sort_index(ascending=True) + X = X.iloc[1:] + + res_df = pd.concat([X, y], axis=1) + + if res_df.isnull().values.any(): + if self.ignore_nan: + res_df.dropna(inplace=True) + else: + raise ValueError( + "Data Error: The TrainingData contains NaN values in intermediate rows. 
If this is intended, set ignore_nan=True in PreprocessingSingleStep.") + + # Applies feature constructions defined in `FeatureConstruction` to the lagged inputs + FeatureConstruction.process(res_df, feature_names=lagged_inputs) - # y = df[self.output].shift(-self.shift) + # drop NaNs occurring due to creation of lags + res_df.dropna(inplace=True) + + X = res_df[self.inputs] + y = res_df[self.output] return X, y @@ -469,6 +536,9 @@ def process_data(self, df: pd.DataFrame) -> TrainingDataMultiStep: TrainingDataMultiStep: Container with tf.data.Dataset objects. """ + # filter data + df = self.filter_df_according_to_timestep(df) + # Applies feature constructions defined in `FeatureConstruction`. FeatureConstruction.process(df) From 0d3783bf9c2aff7a2d6a332d211e808fc768deb1 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Fri, 28 Nov 2025 12:02:23 +0100 Subject: [PATCH 09/42] Fixed small error with feature selection test script --- physXAI/preprocessing/constructed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index c481277..590b694 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -566,7 +566,7 @@ def process(df: DataFrame, feature_names: list[str] = None): feature_names (list[str]): optional parameter to only process those features given in feature_names """ - if not feature_names: + if feature_names is None: for f in FeatureConstruction.features: f.process(df) else: From 2d23228a1b4aa7274bd8f753e02e6f17f82c1954 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Sat, 29 Nov 2025 00:56:41 +0100 Subject: [PATCH 10/42] Fixed error in feature construction output wasn't considered, error occurred when using pinn --- physXAI/preprocessing/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 6e39a7e..a173b6a 100644 
--- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -263,7 +263,7 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: # extract the names of lagged inputs lagged_inputs = [] for f in FeatureConstruction.features: - if isinstance(f, FeatureLag): + if isinstance(f, FeatureLag) and (f.feature in (self.inputs + self.output)): lagged_inputs.append(f.feature) # name of the feature inputs_without_lags = [inp for inp in self.inputs if inp not in lagged_inputs] @@ -271,7 +271,7 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: # Applies feature constructions defined in `FeatureConstruction`. # Only apply for those features that are not lags since lags must be constructed after sampling the data # according to the given time step - FeatureConstruction.process(df, feature_names=inputs_without_lags) + FeatureConstruction.process(df, feature_names=inputs_without_lags + [out for out in self.output if out not in inputs_without_lags]) df = df[inputs_without_lags + [out for out in self.output if out not in inputs_without_lags]] From 5ba2f224cd0218ba5ecb21c2b9f71e29a15b8b53 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Sat, 29 Nov 2025 00:00:04 +0000 Subject: [PATCH 11/42] Update coverage badge [skip ci] --- build/reports/coverage.svg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build/reports/coverage.svg b/build/reports/coverage.svg index c149003..1c7007c 100644 --- a/build/reports/coverage.svg +++ b/build/reports/coverage.svg @@ -9,13 +9,13 @@ - + coverage coverage - 90% - 90% + 89% + 89% From 3dec440b94b2db71ead112488c0646ea3eecdf48 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 3 Dec 2025 08:07:48 +0000 Subject: [PATCH 12/42] Update coverage badge [skip ci] --- build/reports/coverage.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/reports/coverage.svg b/build/reports/coverage.svg index 1c7007c..b3e8ba0 
100644 --- a/build/reports/coverage.svg +++ b/build/reports/coverage.svg @@ -15,7 +15,7 @@ coverage coverage - 89% - 89% + 88% + 88% From 88b1ccc9fac20f99793f7b9cee65da0038f98ae0 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 3 Dec 2025 10:29:36 +0100 Subject: [PATCH 13/42] implemented custom default for shift --- physXAI/preprocessing/preprocessing.py | 23 +++++++++++++++++------ unittests/test_coverage.py | 15 +++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index a173b6a..f961074 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -13,7 +13,7 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' -def convert_shift_to_dict(s: Union[int, str, dict], inputs: list[str]) -> dict: +def convert_shift_to_dict(s: Union[int, str, dict], inputs: list[str], custom_default: Union[int, str] = None) -> dict: """ Convert a given shift variable (int, str) into a dictionary in which a shift is defined for every input. If a dictionary is given as shift, check entries and autocomplete dict if necessary. @@ -21,8 +21,9 @@ def convert_shift_to_dict(s: Union[int, str, dict], inputs: list[str]) -> dict: Args: s (Union[int, str, dict]): Shift value. Either a single string or int which then will be applied to all the inputs or a dictionary in which a different shift can be defined for each input. 
If the dictionary does not specify the - shift for all inputs, the shift for inputs not specified is set to 'previous' as default (autocomplete) + shift for all inputs, the shift for inputs not specified is set to the default value (autocomplete) inputs (list(str)): List of Input variables + custom_default (Union[int, str]): if no custom default is specified, 'previous' is used as default shift """ def return_valid_shift(val: Union[int, str]): @@ -39,6 +40,9 @@ def return_valid_shift(val: Union[int, str]): f"'previous' (or 1 if s is int) or 'mean_over_interval'.") return val + # set custom default or - if no custom default is specified - use 'previous' as default + default = 'previous' if custom_default is None else return_valid_shift(custom_default) + if isinstance(s, (int, str)): d = {} s = return_valid_shift(s) @@ -68,11 +72,11 @@ def get_lag(inputs: list[str], current_input: str) -> int: for inp in inputs_without_lags.keys(): # if an input has a shift assigned already, the validity is checked - # otherwise 'previous' is assigned (default value) + # otherwise default value is assigned if inp in s.keys(): d.update({inp: return_valid_shift(s[inp])}) else: - d.update({inp: 'previous'}) + d.update({inp: default}) # all inputs with lags should have the same shift if inputs_without_lags[inp] > 0: # if current input has lags @@ -128,7 +132,14 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Unio if isinstance(output, str): output = [output] self.output: list[str] = output - self.shift: dict = convert_shift_to_dict(shift, inputs) + + if isinstance(shift, dict) and '_default' in shift.keys(): + self.shift_default = shift['_default'] + shift.__delitem__('_default') + else: + self.shift_default = None + self.shift: dict = convert_shift_to_dict(shift, inputs, custom_default=self.shift_default) + self.time_step = time_step # Training, validation and test size should be equal to 1 @@ -254,7 +265,7 @@ def process_data(self, df: pd.DataFrame) -> 
tuple[pd.DataFrame, pd.DataFrame]: # check if current inputs match inputs (keys) in shift dictionary and update shift if necessary # required for recursive feature selection since inputs change after initialization of Preprocessing object if (len(self.inputs) != len(self.shift.keys())) or not all(inp in self.shift.keys() for inp in self.inputs): - self.shift = convert_shift_to_dict(self.shift, self.inputs) + self.shift = convert_shift_to_dict(self.shift, self.inputs, custom_default=self.shift_default) assert len(self.inputs) == len(self.shift.keys()), ( f"Something went wrong, number of inputs ({len(self.inputs)})" diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index f083a35..e968034 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -153,6 +153,21 @@ def test_autocomplete_incomplete_dict(self): assert len(res) == len(self.inputs) assert res == res_expected + # test case: autocomplete incomplete dictionary given for shift with custom default + def test_autocomplete_incomplete_dict_with_custom_default(self): + shift = {'reaTZon_y': 1, 'reaTZon_y_lag1': 1, 'weaSta_reaWeaTDryBul_y': 'mean_over_interval'} + + # custom default 0 ('current') is applied to all inputs that are not specified + res = convert_shift_to_dict(shift, self.inputs, custom_default=0) + res_expected = {'reaTZon_y': 'previous', 'reaTZon_y_lag1': 'previous', 'reaTZon_y_lag2': 'previous', + 'weaSta_reaWeaTDryBul_y': 'mean_over_interval', + 'weaSta_reaWeaTDryBul_y_lag1': 'mean_over_interval', + 'weaSta_reaWeaHDirNor_y': 'current', 'oveHeaPumY_u': 'current', + 'oveHeaPumY_u_lag1': 'current', + 'oveHeaPumY_u_lag2': 'current'} + assert len(res) == len(self.inputs) + assert res == res_expected + # test case: lags of the same input have mismatching shifts def test_lag_with_mismatching_shifts(self): shift = {'reaTZon_y': 0, 'reaTZon_y_lag1': 1, 'weaSta_reaWeaTDryBul_y': 'mean_over_interval'} From ef9a8d4a0b2cc00a0de934307972d733ead3403f Mon Sep 17 00:00:00 2001 From: "ross.simon" 
Date: Wed, 3 Dec 2025 11:15:54 +0100 Subject: [PATCH 14/42] Updated docstrings --- physXAI/preprocessing/preprocessing.py | 30 ++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index f961074..210ab0a 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -109,8 +109,19 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Unio Args: inputs (List[str]): List of column names to be used as input features. output (Union[str, List[str]]): Column name(s) for the target variable(s). - shift (int): The number of time steps to shift the target variable for forecasting. # TODO: update docstring - A shift of one means predicting the next time step. + shift (Union[int, str, dict]): Time step of the input data used to predict the output. + - If a single int or str is given, it applies to all inputs. + - If a dict is provided, it can specify different shifts for individual inputs. + - If not all inputs are specified in the dict, unspecified inputs will use a default value (autocomplete). + Examples: + - shift = 0 or shift = 'current': Current time step will be used for prediction. + - shift = 1 or shift = 'previous': Previous values will be used for prediction. + - shift = 'mean_over_interval': Mean between current and previous time step will be used. + - shift = { + 'inp_1': 1, + 'inp_2': 'mean_over_interval', + '_default': 0, # current time step will be used for all inputs not specified in the dict + } time_step (Optional[Union[int, float]]): Optional time step sampling. If None, sampling of data is used. test_size (float): Proportion of the dataset to allocate to the test set. val_size (float): Proportion of the dataset to allocate to the validation set. 
@@ -229,8 +240,19 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Unio Args: inputs (List[str]): List of column names to be used as input features. output (Union[str, List[str]]): Column name(s) for the target variable(s). - shift (int): The number of time steps to shift the target variable for forecasting. # TODO: update doc dring - A shift of one means predicting the next time step. + shift (Union[int, str, dict]): Time step of the input data used to predict the output. + - If a single int or str is given, it applies to all inputs. + - If a dict is provided, it can specify different shifts for individual inputs. + - If not all inputs are specified in the dict, unspecified inputs will use a default value (autocomplete). + Examples: + - shift = 0 or shift = 'current': Current time step will be used for prediction. + - shift = 1 or shift = 'previous': Previous values will be used for prediction. + - shift = 'mean_over_interval': Mean between current and previous time step will be used. + - shift = { + 'inp_1': 1, + 'inp_2': 'mean_over_interval', + '_default': 0, # current time step will be used for all inputs not specified in the dict + } time_step (Optional[Union[int, float]]): Optional time step sampling. If None, sampling of data is used. test_size (float): Proportion of the dataset to allocate to the test set. val_size (float): Proportion of the dataset to allocate to the validation set. 
From b4588036227b5d2fa0a60c95663c16750111bee7 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 3 Dec 2025 15:49:19 +0100 Subject: [PATCH 15/42] Implemented test and example for different shifts updated docstrings --- .../Dummy_shifting.py | 73 +++++++++++++++++++ physXAI/preprocessing/preprocessing.py | 15 ++-- unittests/test_coverage.py | 31 ++++++++ 3 files changed, 113 insertions(+), 6 deletions(-) create mode 100644 executables/bestest_hydronic_heat_pump/Dummy_shifting.py diff --git a/executables/bestest_hydronic_heat_pump/Dummy_shifting.py b/executables/bestest_hydronic_heat_pump/Dummy_shifting.py new file mode 100644 index 0000000..340dd75 --- /dev/null +++ b/executables/bestest_hydronic_heat_pump/Dummy_shifting.py @@ -0,0 +1,73 @@ +from physXAI.models.ann.ann_design import ClassicalANNModel +from physXAI.preprocessing.preprocessing import PreprocessingSingleStep +from physXAI.preprocessing.constructed import Feature +from physXAI.utils.logging import Logger + + +""" +This script demonstrates the usage of different shifts. It is not physically meaningful. +""" +# Setup up logger for saving +Logger.setup_logger(folder_name='Dummy_shifting_ann', override=True) + +# File path to data +file_path = r"data/bestest_hydronic_heat_pump/pid_data.csv" + +# List of input features. 
Can include constructed features and lagged inputs +inputs = ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', + 'weaSta_reaWeaHDirNor_y', 'oveHeaPumY_u', 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] +# Output feature +output = 'Change(T_zone)' + +""" +The constructed features are automatically added to the data via 'physXAI.preprocessing.constructed.py' +Lagged inputs can be added directly based on the feature +""" +x1 = Feature('reaTZon_y') +x1.lag(2) # reaTZon_y_lag1, reaTZon_y_lag2 +x2 = Feature('weaSta_reaWeaTDryBul_y') +x2.lag(1) # weaSta_reaWeaTDryBul_y_lag1 +x3 = Feature('oveHeaPumY_u') +x3.lag(2) # oveHeaPumY_u_lag1, oveHeaPumY_u_lag2 + + +""" +shift (Union[int, str, dict]): Time step of the input data used to predict the output. + - If a single int or str is given, it applies to all inputs. + - If a dict is provided, it can specify different shifts for individual inputs. + - If not all inputs are specified in the dict, unspecified inputs will use a default value (autocomplete). + Examples: + - shift = 0 or shift = 'current': Current time step will be used for prediction. + - shift = 1 or shift = 'previous': Previous values will be used for prediction. + - shift = 'mean_over_interval': Mean between current and previous time step will be used. 
+ - shift = { + 'inp_1': 1, + 'inp_2': 'mean_over_interval', + '_default': 0, # current time step will be used for all inputs not specified in the dict + # If no custom default value is given in dict, 'previous' will be used as default + } +""" +shift = { + 'reaTZon_y': 'previous', # for all lags of reaTZon_y, the shift will be set automatically + 'weaSta_reaWeaHDirNor_y': 'mean_over_interval', + '_default': 0, +} + +# Create Training data +# Time step defines target sampling: if original sampling of data is in 15min intervals, it is resampled to 1h intervals for time_step=4 +# Hence, if the shift method of an input is defined as 'mean_over_interval', the mean over the last hour is taken as input +prep = PreprocessingSingleStep(inputs, output, shift=shift, time_step=4) + +# Process Training data +td = prep.pipeline(file_path) + +# Classical ANN +m = ClassicalANNModel(epochs=500) + +# Training pipeline +model = m.pipeline(td) + +# Log setup of preprocessing and model as json +Logger.log_setup(prep, m) +# Log training data as pickle +Logger.save_training_data(td) \ No newline at end of file diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 210ab0a..1ad0f6e 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -121,6 +121,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Unio 'inp_1': 1, 'inp_2': 'mean_over_interval', '_default': 0, # current time step will be used for all inputs not specified in the dict + # If no custom default value is given in dict, 'previous' will be used as default } time_step (Optional[Union[int, float]]): Optional time step sampling. If None, sampling of data is used. test_size (float): Proportion of the dataset to allocate to the test set. 
@@ -252,6 +253,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Unio 'inp_1': 1, 'inp_2': 'mean_over_interval', '_default': 0, # current time step will be used for all inputs not specified in the dict + # If no custom default value is given in dict, 'previous' will be used as default } time_step (Optional[Union[int, float]]): Optional time step sampling. If None, sampling of data is used. test_size (float): Proportion of the dataset to allocate to the test set. @@ -274,7 +276,7 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: 1. Applies feature constructions defined in `FeatureConstruction`. 2. Selects relevant input and output columns. 3. Handles missing values by dropping rows. - 4. Shifts the target variable(s) `y` for forecasting. + 4. Applies the shift on each input variable. Args: df (pd.DataFrame): The input DataFrame. @@ -314,7 +316,8 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: last_valid_index = non_nan_rows.iloc[::-1].idxmax() if non_nan_rows.any() else None df = df.loc[first_valid_index:last_valid_index] - def get_mean_over_interval(y: pd.DataFrame, x: pd.DataFrame, inputs: list[str]): + def get_mean_over_interval(y: pd.DataFrame, x: pd.DataFrame): + """return mean values of x on target sampling (index of y)""" def pairwise(iterable: Iterable): "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
a, b = itertools.tee(iterable) @@ -326,7 +329,7 @@ def pairwise(iterable: Iterable): for i, j in pairwise(y.index): # output interval is target grid slicer = np.logical_and(original_grid >= i, original_grid < j) d = {'Index': j} - for inp in inputs: + for inp in x.columns: d[inp] = x[inp][slicer].mean() results.append(d) @@ -353,7 +356,7 @@ def pairwise(iterable: Iterable): y = y.iloc[1:] X = X.iloc[1:] elif all('mean_over_interval' == self.shift[k] for k in inputs_without_lags): - X = get_mean_over_interval(y, X, inputs_without_lags) + X = get_mean_over_interval(y, X) # synchronize length between X and y y = y.iloc[1:] @@ -363,7 +366,7 @@ def pairwise(iterable: Iterable): # only process inputs with shift method mean_over_interval first since X cannot be filtered / sampled # to the actual required time steps until the intermediate values were taken into the mean if self.shift[inp] == 'mean_over_interval': - res.append(get_mean_over_interval(y, X[[inp]], [inp])) + res.append(get_mean_over_interval(y, X[[inp]])) # filter / sample X according to required time step X = self.filter_df_according_to_timestep(X) @@ -386,7 +389,7 @@ def pairwise(iterable: Iterable): X = pd.concat(res, axis=1) # Shift methods 'previous' and 'mean_over_interval' reduce available data points by 1. 
- # Therefore, length of X and y have to be synchronized + # Therefore, lengths of X and y have to be synchronized if 'previous' in self.shift.values() or 'mean_over_interval' in self.shift.values(): y = y.iloc[1:] X = X.sort_index(ascending=True) diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index e968034..49c6099 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -37,6 +37,11 @@ def inputs_php(): def inputs_tair(): return ['reaTZon_y', 'weaSta_reaWeaTDryBul_y', 'oveHeaPumY_u', 'oveHeaPumY_u_lag1'] +@pytest.fixture(scope='module') +def inputs_tair_extended(): + return ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', + 'weaSta_reaWeaHDirNor_y', 'oveHeaPumY_u', 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] + @pytest.fixture(scope='module') def output_php(): return 'reaPHeaPum_y' @@ -227,6 +232,32 @@ def tair_data_total(file_path, inputs_tair, output_tair): td = prep.pipeline(file_path) return prep, td +def test_shifting(file_path, inputs_tair_extended, output_tair): + # Setup up logger for saving + Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) + + # Create lags + x1 = Feature('reaTZon_y') + x1.lag(2) # reaTZon_y_lag1, reaTZon_y_lag2 + x2 = Feature('weaSta_reaWeaTDryBul_y') + x2.lag(1) # weaSta_reaWeaTDryBul_y_lag1 + x3 = Feature('oveHeaPumY_u') + x3.lag(2) # oveHeaPumY_u_lag1, oveHeaPumY_u_lag2 + + shift = { + 'reaTZon_y': 'previous', # for all lags of reaTZon_y, the shift will be set automatically + 'weaSta_reaWeaHDirNor_y': 'mean_over_interval', + '_default': 0, + } + + # Create & process Training data + prep = PreprocessingSingleStep(inputs_tair_extended, output_tair, shift=shift, time_step=4) + td = prep.pipeline(file_path) + + # Build & train Classical ANN + m = ClassicalANNModel(epochs=100) + model = m.pipeline(td) + def test_model_linReg(inputs_php, output_php, file_path): # Setup up logger for saving 
Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) From 23ac15bb17078960058b8560210aa0cdd8ef861e Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 3 Dec 2025 14:53:41 +0000 Subject: [PATCH 16/42] Update coverage badge [skip ci] --- build/reports/coverage.svg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build/reports/coverage.svg b/build/reports/coverage.svg index b3e8ba0..c149003 100644 --- a/build/reports/coverage.svg +++ b/build/reports/coverage.svg @@ -9,13 +9,13 @@ - + coverage coverage - 88% - 88% + 90% + 90% From 80815ee97acc79bcdc2564a7cce533824814d1df Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 3 Dec 2025 15:59:52 +0100 Subject: [PATCH 17/42] reduce number of epochs for more efficient testing --- executables/bestest_hydronic_heat_pump/Dummy_shifting.py | 2 +- unittests/test_coverage.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/executables/bestest_hydronic_heat_pump/Dummy_shifting.py b/executables/bestest_hydronic_heat_pump/Dummy_shifting.py index 340dd75..48859bb 100644 --- a/executables/bestest_hydronic_heat_pump/Dummy_shifting.py +++ b/executables/bestest_hydronic_heat_pump/Dummy_shifting.py @@ -62,7 +62,7 @@ td = prep.pipeline(file_path) # Classical ANN -m = ClassicalANNModel(epochs=500) +m = ClassicalANNModel(epochs=50) # Training pipeline model = m.pipeline(td) diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index 49c6099..54ba351 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -255,7 +255,7 @@ def test_shifting(file_path, inputs_tair_extended, output_tair): td = prep.pipeline(file_path) # Build & train Classical ANN - m = ClassicalANNModel(epochs=100) + m = ClassicalANNModel(epochs=1) model = m.pipeline(td) def test_model_linReg(inputs_php, output_php, file_path): From 367624fb13ce4dabb6cddfd6663e89e69769defa Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Fri, 5 Dec 2025 11:11:38 +0100 
Subject: [PATCH 18/42] implemented handling of constructed features including lagged features fixing review issue https://github.com/RWTH-EBC/physXAI/pull/51#discussion_r2589444241 --- physXAI/preprocessing/constructed.py | 41 ++++++++++++++++++++++++++ physXAI/preprocessing/preprocessing.py | 11 +++---- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index 975b3e9..df88405 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -559,6 +559,47 @@ def get_feature(name: str) -> Union[FeatureBase, None]: return f return None + @staticmethod + def get_features_including_lagged_features(l: list[str] = None) -> list[str]: + """ + returns a list of the names of all FeatureLag and FeatureTwo where at least one feature is a FeatureLag + - within the given list or + - of all constructed features if list is None + + Args: + l (list[str]): list of feature names to search in + + Returns: + list[str]: the list of lag-based features + """ + + # if no list is given, search in all features + if not l: + l = FeatureConstruction.features + + def recursive_search(feature): + """Recursively checks for lagged features""" + if isinstance(feature, FeatureLag): + return True + + elif isinstance(feature, FeatureTwo): + # Check both sub-features recursively + return recursive_search(feature.feature1) or recursive_search(feature.feature2) + + return False + + res = list() + for f in FeatureConstruction.features: + if isinstance(f, FeatureLag) and (f.feature in l): + res.append(f.feature) # name of the feature + + elif isinstance(f, FeatureTwo) and (f.feature in l): + # Use recursive search to check for nested lagged features + if recursive_search(f.feature1) or recursive_search(f.feature2): + res.append(f.feature) + + return res + @staticmethod def process(df: DataFrame, feature_names: list[str] = None): """ diff --git a/physXAI/preprocessing/preprocessing.py 
b/physXAI/preprocessing/preprocessing.py index 1ad0f6e..cdf26db 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -295,13 +295,10 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: f"Something went wrong, number of inputs ({len(self.inputs)})" f" doesn't match number of inputs defined in shift ({len(self.shift.keys())})") - # extract the names of lagged inputs - lagged_inputs = [] - for f in FeatureConstruction.features: - if isinstance(f, FeatureLag) and (f.feature in (self.inputs + self.output)): - lagged_inputs.append(f.feature) # name of the feature + # extract the names of all features in inputs and outputs that are based on lagged features + lag_based_features = FeatureConstruction.get_features_including_lagged_features(self.inputs + self.output) - inputs_without_lags = [inp for inp in self.inputs if inp not in lagged_inputs] + inputs_without_lags = [inp for inp in self.inputs if inp not in lag_based_features] # Applies feature constructions defined in `FeatureConstruction`. # Only apply for those features that are not lags since lags must be constructed after sampling the data @@ -405,7 +402,7 @@ def pairwise(iterable: Iterable): "Data Error: The TrainingData contains NaN values in intermediate rows. 
If this is intended, set ignore_nan=True in PreprocessingSingleStep.") # Applies feature constructions defined in `FeatureConstruction` to the lagged inputs - FeatureConstruction.process(res_df, feature_names=lagged_inputs) + FeatureConstruction.process(res_df, feature_names=lag_based_features) # drop NaNs occurring due to creation of lags res_df.dropna(inplace=True) From 567efcdb7ba1ce42bd738f7552b0dbcbd5f56029 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Fri, 5 Dec 2025 17:31:27 +0100 Subject: [PATCH 19/42] Partly integrated shift as attribute sampling_method in Feature --- physXAI/preprocessing/constructed.py | 98 ++++++++++++++++++++++---- physXAI/preprocessing/preprocessing.py | 72 +++++++++---------- 2 files changed, 120 insertions(+), 50 deletions(-) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index df88405..a9e0b7a 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -4,6 +4,20 @@ from pandas import DataFrame, Series +def _return_valid_sampling_method(v: Union[int, str]): + """ check the validity of the given sampling method and return a string if val is int """ + if v in ['current', 0]: + return 'current' + elif v in ['previous', 1]: + return 'previous' + elif v == 'mean_over_interval': + return 'mean_over_interval' + else: + raise ValueError( + f"Value of sampling method not supported, value is: {v}. Sampling method must be 'current' " + f"(or 0 if s is int), 'previous' (or 1 if s is int) or 'mean_over_interval'.") + + class FeatureBase(ABC): """ Abstract Base Class for all feature engineering components. @@ -11,20 +25,45 @@ class FeatureBase(ABC): in a Pandas DataFrame. It supports arithmetic operations to combine features. """ - def __init__(self, name: str, **kwargs): + def __init__(self, name: str, sampling_method: Union[str, int] = None, **kwargs): """ Initializes a FeatureBase instance. Args: name (str): The name of the feature. 
This will be the column name in the DataFrame. + sampling_method (Union[str, int]): Time step of the input data used to predict the output. + - if None: FeatureConstruction._default_sampling_method is used + - if 'current' or 0: Current time step will be used for prediction. + - if 'previous' or 1: Previous time step will be used for prediction. + - if 'mean_over_interval': Mean between current and previous time step will be used. **kwargs: Catches any additional keyword arguments. """ self.feature: str = name + self.sampling_method = sampling_method # Automatically registers the newly created feature instance with the FeatureConstruction manager FeatureConstruction.append(self) + @property + def sampling_method(self): + return self._sampling_method + + @sampling_method.setter + def sampling_method(self, val: Union[str, int] = None): + """ + Sets the feature's sampling method. If None is given, FeatureConstruction._default_sampling_method is used + Available methods: + - 'current' or 0: Current time step will be used for prediction. + - 'previous' or 1: Previous time step will be used for prediction. + - 'mean_over_interval': Mean between current and previous time step will be used. + """ + + if val is None: + self._sampling_method = FeatureConstruction.get_default_sampling_method() + else: + self._sampling_method = _return_valid_sampling_method(val) + def rename(self, name: str): """ Renames the feature. @@ -103,7 +142,8 @@ def lag(self, lag: int, previous: bool = True): FeatureLag object for the specified lag_value. Returns: - FeatureLag or List[FeatureLag]: A single lagged feature or a list of lagged features. + FeatureLag or List[FeatureLag]: A single lagged feature or a list of lagged features, each with the same + sampling method as their corresponding base feature. 
""" if previous and lag > 1: @@ -115,8 +155,11 @@ def lag(self, lag: int, previous: bool = True): return FeatureLag(self, lag) def get_config(self) -> dict: - return {'class_name': self.__class__.__name__, - 'name': self.feature} + return { + 'class_name': self.__class__.__name__, + 'name': self.feature, + 'sampling_method': self.sampling_method, + } @classmethod def from_config(cls, config: dict) -> 'FeatureBase': @@ -189,11 +232,27 @@ def __init__(self, f: Union[FeatureBase, str], lag: int, name: str = None, **kwa """ if isinstance(f, FeatureBase): self.origf: str = f.feature + if name is None: + name = f.feature + f'_lag{lag}' + + # lags must have the same sampling_method as their base feature + sampling_method = f.sampling_method else: self.origf: str = f - if name is None: - name = f.feature + f'_lag{lag}' - super().__init__(name) + if name is None: + name = f + f'_lag{lag}' + + # lags must have the same sampling_method as their base feature + sampling_method = FeatureConstruction.get_feature(f).sampling_method + + if 'sampling_method' in kwargs.keys(): + assert kwargs['sampling_method'] == sampling_method, (f'lags must have the same sampling method as their ' + f'base feature. 
Sampling method of base feature is ' + f'{sampling_method} but for lag ' + f'{kwargs['sampling_method']} was given') + kwargs.__delitem__('sampling_method') # constructor must not get more than one arg with the same key + + super().__init__(name, sampling_method=sampling_method, **kwargs) self.lag: int = lag def process(self, df: DataFrame) -> Series: @@ -236,7 +295,7 @@ def __init__(self, feature1: Union[FeatureBase, int, float], feature2: Union[Fea f2n = str(feature2) if name is None: name = self.name(f1n, f2n) - super().__init__(name) + super().__init__(name, **kwargs) self.feature1 = feature1 self.feature2 = feature2 @@ -414,7 +473,7 @@ def __init__(self, f1: FeatureBase, name: str = None, **kwargs): self.f1: FeatureBase = f1 if name is None: name = 'exp(' + f1.feature + ')' - super().__init__(name) + super().__init__(name, **kwargs) def process(self, df: DataFrame) -> Series: if self.feature not in df.columns: @@ -444,7 +503,7 @@ def __init__(self, f1: FeatureBase, name: str = None, **kwargs): self.f1: FeatureBase = f1 if name is None: name = 'sin(' + f1.feature + ')' - super().__init__(name) + super().__init__(name, **kwargs) def process(self, df: DataFrame) -> Series: if self.feature not in df.columns: @@ -474,7 +533,7 @@ def __init__(self, f1: FeatureBase, name: str = None, **kwargs): self.f1: FeatureBase = f1 if name is None: name = 'cos(' + f1.feature + ')' - super().__init__(name) + super().__init__(name, **kwargs) def process(self, df: DataFrame) -> Series: if self.feature not in df.columns: @@ -504,7 +563,7 @@ class FeatureConstant(FeatureBase): def __init__(self, c: float, name: str, **kwargs): self.c = c - super().__init__(name) + super().__init__(name, **kwargs) def process(self, df: DataFrame) -> Series: if self.feature not in df.columns: @@ -525,6 +584,21 @@ class FeatureConstruction: """ features = list[FeatureBase]() + _default_sampling_method = 'previous' + + @staticmethod + def get_default_sampling_method(): + return 
FeatureConstruction._default_sampling_method + + @staticmethod + def set_default_sampling_method(val: Union[str, int]): + """ + Sets the default sampling method for all features that do not have a custom sampling method. Available methods: + - 'current' or 0: Current time step will be used for prediction. + - 'previous' or 1: Previous time step will be used for prediction. + - 'mean_over_interval': Mean between current and previous time step will be used. + """ + FeatureConstruction._default_sampling_method = _return_valid_sampling_method(val) @staticmethod def reset(): diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index cdf26db..b044165 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -5,7 +5,7 @@ import pandas as pd import itertools from sklearn.model_selection import train_test_split -from physXAI.preprocessing.constructed import FeatureConstruction, FeatureLag +from physXAI.preprocessing.constructed import FeatureConstruction, FeatureBase from physXAI.preprocessing.training_data import TrainingData, TrainingDataMultiStep, TrainingDataGeneric from physXAI.utils.logging import get_full_path os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' @@ -194,7 +194,7 @@ def load_data(self, file_path: str) -> pd.DataFrame: return df - def filter_df_according_to_timestep(self, df: pd.DataFrame): + def sample_df_according_to_timestep(self, df: pd.DataFrame): filtering = (df.index - df.index[0]) % self.time_step == 0 df = df[filtering] return df @@ -276,7 +276,7 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: 1. Applies feature constructions defined in `FeatureConstruction`. 2. Selects relevant input and output columns. 3. Handles missing values by dropping rows. - 4. Applies the shift on each input variable. + 4. Applies the defined sampling method on each input variable. Args: df (pd.DataFrame): The input DataFrame. 
@@ -286,15 +286,6 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: and target (y) DataFrames. """ - # check if current inputs match inputs (keys) in shift dictionary and update shift if necessary - # required for recursive feature selection since inputs change after initialization of Preprocessing object - if (len(self.inputs) != len(self.shift.keys())) or not all(inp in self.shift.keys() for inp in self.inputs): - self.shift = convert_shift_to_dict(self.shift, self.inputs, custom_default=self.shift_default) - - assert len(self.inputs) == len(self.shift.keys()), ( - f"Something went wrong, number of inputs ({len(self.inputs)})" - f" doesn't match number of inputs defined in shift ({len(self.shift.keys())})") - # extract the names of all features in inputs and outputs that are based on lagged features lag_based_features = FeatureConstruction.get_features_including_lagged_features(self.inputs + self.output) @@ -304,6 +295,7 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: # Only apply for those features that are not lags since lags must be constructed after sampling the data # according to the given time step FeatureConstruction.process(df, feature_names=inputs_without_lags + [out for out in self.output if out not in inputs_without_lags]) + features_without_lags: list[FeatureBase] = [FeatureConstruction.get_feature(inp) for inp in inputs_without_lags] df = df[inputs_without_lags + [out for out in self.output if out not in inputs_without_lags]] @@ -334,60 +326,63 @@ def pairwise(iterable: Iterable): return x - # output is independent of shift -> filter / sample according to time step already + # output is independent of sampling of inputs -> sample according to time step already y = df[self.output].copy() - y = self.filter_df_according_to_timestep(y) + y = self.sample_df_according_to_timestep(y) X = df[inputs_without_lags].copy() - if all('current' == self.shift[k] for k in inputs_without_lags): + if 
all('current' == f.sampling_method for f in features_without_lags): # filter / sample data - X = self.filter_df_according_to_timestep(X) + X = self.sample_df_according_to_timestep(X) # nothing more to do here - elif all('previous' == self.shift[k] for k in inputs_without_lags): + elif all('previous' == f.sampling_method for f in features_without_lags): # filter / sample data - X = self.filter_df_according_to_timestep(X) + X = self.sample_df_according_to_timestep(X) # shift data by 1 and shorten DataFrames accordingly X = X.shift(1) y = y.iloc[1:] X = X.iloc[1:] - elif all('mean_over_interval' == self.shift[k] for k in inputs_without_lags): + elif all('mean_over_interval' == f.sampling_method for f in features_without_lags): X = get_mean_over_interval(y, X) # synchronize length between X and y y = y.iloc[1:] - else: # different inputs have different shifts + else: # different inputs have different sampling methods res = [] - for inp in inputs_without_lags: - # only process inputs with shift method mean_over_interval first since X cannot be filtered / sampled + previous_or_mean_in_sampling_methods = False + for f in features_without_lags: + # only process inputs with sampling method mean_over_interval first since X cannot be sampled # to the actual required time steps until the intermediate values were taken into the mean - if self.shift[inp] == 'mean_over_interval': - res.append(get_mean_over_interval(y, X[[inp]])) - - # filter / sample X according to required time step - X = self.filter_df_according_to_timestep(X) - # process inputs with shift methods 'current' and 'previous' - for inp in inputs_without_lags: - _x = X[[inp]] - if self.shift[inp] == 'current': + if f.sampling_method == 'mean_over_interval': + res.append(get_mean_over_interval(y, X[[f.feature]])) + previous_or_mean_in_sampling_methods = True + + # sample X according to required time step + X = self.sample_df_according_to_timestep(X) + # process inputs with sampling methods 'current' and 'previous' + 
for f in features_without_lags: + _x = X[[f.feature]] + if f.sampling_method == 'current': # no transformation needed res.append(_x) - elif self.shift[inp] == 'previous': + elif f.sampling_method == 'previous': # shift by 1 _x = _x.shift(1) _x = _x.iloc[1:] res.append(_x) - elif self.shift[inp] == 'mean_over_interval': + previous_or_mean_in_sampling_methods = True + elif f.sampling_method == 'mean_over_interval': continue else: - raise NotImplementedError(f"Shift method '{self.shift[inp]}' not implemented.") + raise NotImplementedError(f"Sampling method '{f.sampling_method}' not implemented.") X = pd.concat(res, axis=1) - # Shift methods 'previous' and 'mean_over_interval' reduce available data points by 1. + # Sampling methods 'previous' and 'mean_over_interval' reduce available data points by 1. # Therefore, lengths of X and y have to be synchronized - if 'previous' in self.shift.values() or 'mean_over_interval' in self.shift.values(): + if previous_or_mean_in_sampling_methods: y = y.iloc[1:] X = X.sort_index(ascending=True) X = X.iloc[1:] @@ -399,7 +394,8 @@ def pairwise(iterable: Iterable): res_df.dropna(inplace=True) else: raise ValueError( - "Data Error: The TrainingData contains NaN values in intermediate rows. If this is intended, set ignore_nan=True in PreprocessingSingleStep.") + "Data Error: The TrainingData contains NaN values in intermediate rows. If this is intended, set " + "ignore_nan=True in PreprocessingSingleStep.") # Applies feature constructions defined in `FeatureConstruction` to the lagged inputs FeatureConstruction.process(res_df, feature_names=lag_based_features) @@ -570,7 +566,7 @@ def process_data(self, df: pd.DataFrame) -> TrainingDataMultiStep: """ # filter data - df = self.filter_df_according_to_timestep(df) + df = self.sample_df_according_to_timestep(df) # Applies feature constructions defined in `FeatureConstruction`. 
FeatureConstruction.process(df) From e0fc769d29568a559d1957daa81638567121161e Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Sun, 7 Dec 2025 11:46:59 +0100 Subject: [PATCH 20/42] Implemented input list as list of Features and str before: only str allowed --- physXAI/preprocessing/constructed.py | 27 ++++++++++++++++++++++++++ physXAI/preprocessing/preprocessing.py | 12 ++++++------ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index a9e0b7a..19af47b 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -674,6 +674,33 @@ def recursive_search(feature): return res + @staticmethod + def process_inputs(inputs: list[Union[str, FeatureBase]]) -> list[str]: + """ + Creates a Feature for all inputs that are not yet created as features + + Args: + inputs (list(Union[str, FeatureBase])): List of column names or Features to be used as input features. + + Returns: + list[str]: list of column names of all input features + """ + + input_str = list() + + for inp in inputs: + if isinstance(inp, FeatureBase): + input_str.append(inp.feature) # get name of feature (which is used as column name) + elif isinstance(inp, str): + input_str.append(inp) + # check if a Feature with the given name (inp) was already created, otherwise create it + if not any(inp == f.feature for f in FeatureConstruction.features): + Feature(name=inp) + else: + raise TypeError(f"Only inputs with types 'str' or 'FeatureBase' allowed, got type {type(inp)} instead") + + return input_str + @staticmethod def process(df: DataFrame, feature_names: list[str] = None): """ diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index b044165..97b5310 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -98,7 +98,7 @@ class PreprocessingData(ABC): Abstract Preprocessing Class """ - def __init__(self, 
inputs: list[str], output: Union[str, list[str]], shift: Union[int, str, dict] = 'previous', + def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, list[str]], shift: Union[int, str, dict] = 'previous', time_step: Optional[Union[int, float]] = None, test_size: float = 0.1, val_size: float = 0.1, random_state: int = 42, time_index_col: Union[str, float] = 0, csv_delimiter: str = ';', csv_encoding: str = 'latin1', @@ -107,7 +107,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Unio Initializes the Preprocessing instance. Args: - inputs (List[str]): List of column names to be used as input features. + inputs (List[Union[str, FeatureBase]]): List of column names or Features to be used as input features. output (Union[str, List[str]]): Column name(s) for the target variable(s). shift (Union[int, str, dict]): Time step of the input data used to predict the output. - If a single int or str is given, it applies to all inputs. @@ -140,7 +140,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Unio self.csv_header = csv_header self.csv_skiprows = csv_skiprows - self.inputs: list[str] = inputs + self.inputs: list[str] = FeatureConstruction.process_inputs(inputs) if isinstance(output, str): output = [output] self.output: list[str] = output @@ -230,7 +230,7 @@ class PreprocessingSingleStep(PreprocessingData): validation, and test sets. 
""" - def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Union[int, str, dict] = 'previous', + def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, list[str]], shift: Union[int, str, dict] = 'previous', time_step: Optional[Union[int, float]] = None, test_size: float = 0.1, val_size: float = 0.1, random_state: int = 42, time_index_col: Union[str, float] = 0, csv_delimiter: str = ';', csv_encoding: str = 'latin1', @@ -239,7 +239,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], shift: Unio Initializes the PreprocessingSingleStep instance. Args: - inputs (List[str]): List of column names to be used as input features. + inputs (List[Union[str, FeatureBase]]): List of column names or Features to be used as input features. output (Union[str, List[str]]): Column name(s) for the target variable(s). shift (Union[int, str, dict]): Time step of the input data used to predict the output. - If a single int or str is given, it applies to all inputs. @@ -492,7 +492,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], label_width Initializes the PreprocessingMultiStep instance. Args: - inputs (List[str]): Column names for input features to the main RNN. + inputs (List[Union[str, FeatureBase]]): List of column names or Features that are input features to the main RNN. output (Union[str, List[str]]): Column name(s) for target variable(s). label_width (int): Number of time steps in the output (label) sequence. warmup_width (int): Number of time steps in the warmup sequence (for RNN state initialization). 
From be1365b350f200ab53351ca10ec3b88c57662530 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Sun, 7 Dec 2025 10:50:09 +0000 Subject: [PATCH 21/42] Update coverage badge [skip ci] --- build/reports/coverage.svg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build/reports/coverage.svg b/build/reports/coverage.svg index c149003..6963b3e 100644 --- a/build/reports/coverage.svg +++ b/build/reports/coverage.svg @@ -9,13 +9,13 @@ - + coverage coverage - 90% - 90% + 87% + 87% From 7d439f883f3e66c6824bd267831bb00919cb2f03 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Sun, 7 Dec 2025 11:58:16 +0100 Subject: [PATCH 22/42] Fix SyntaxError in python versions earlier than 3.12 --- physXAI/preprocessing/constructed.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index 19af47b..a8c5da1 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -246,10 +246,10 @@ def __init__(self, f: Union[FeatureBase, str], lag: int, name: str = None, **kwa sampling_method = FeatureConstruction.get_feature(f).sampling_method if 'sampling_method' in kwargs.keys(): - assert kwargs['sampling_method'] == sampling_method, (f'lags must have the same sampling method as their ' - f'base feature. Sampling method of base feature is ' - f'{sampling_method} but for lag ' - f'{kwargs['sampling_method']} was given') + assert kwargs['sampling_method'] == sampling_method, ( + f"lags must have the same sampling method as their base feature. Sampling method of base feature is" + f" {sampling_method} but for lag {kwargs['sampling_method']} was given as sampling method." 
+ ) kwargs.__delitem__('sampling_method') # constructor must not get more than one arg with the same key super().__init__(name, sampling_method=sampling_method, **kwargs) From f160948bf084802fab25c71d681aca6043411ec2 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Sun, 7 Dec 2025 14:09:28 +0100 Subject: [PATCH 23/42] Added DeprecationWarning for shift parameter, updated testing and example --- .../Dummy_shifting.py | 73 ----------------- .../different_sampling_methods.py | 78 +++++++++++++++++++ physXAI/preprocessing/preprocessing.py | 67 ++++++---------- unittests/test_coverage.py | 37 ++++++--- 4 files changed, 126 insertions(+), 129 deletions(-) delete mode 100644 executables/bestest_hydronic_heat_pump/Dummy_shifting.py create mode 100644 executables/bestest_hydronic_heat_pump/different_sampling_methods.py diff --git a/executables/bestest_hydronic_heat_pump/Dummy_shifting.py b/executables/bestest_hydronic_heat_pump/Dummy_shifting.py deleted file mode 100644 index 48859bb..0000000 --- a/executables/bestest_hydronic_heat_pump/Dummy_shifting.py +++ /dev/null @@ -1,73 +0,0 @@ -from physXAI.models.ann.ann_design import ClassicalANNModel -from physXAI.preprocessing.preprocessing import PreprocessingSingleStep -from physXAI.preprocessing.constructed import Feature -from physXAI.utils.logging import Logger - - -""" -This script demonstrates the usage of different shifts. It is not physically meaningful. -""" -# Setup up logger for saving -Logger.setup_logger(folder_name='Dummy_shifting_ann', override=True) - -# File path to data -file_path = r"data/bestest_hydronic_heat_pump/pid_data.csv" - -# List of input features. 
Can include constructed features and lagged inputs -inputs = ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', - 'weaSta_reaWeaHDirNor_y', 'oveHeaPumY_u', 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] -# Output feature -output = 'Change(T_zone)' - -""" -The constructed features are automatically added to the data via 'physXAI.preprocessing.constructed.py' -Lagged inputs can be added directly based on the feature -""" -x1 = Feature('reaTZon_y') -x1.lag(2) # reaTZon_y_lag1, reaTZon_y_lag2 -x2 = Feature('weaSta_reaWeaTDryBul_y') -x2.lag(1) # weaSta_reaWeaTDryBul_y_lag1 -x3 = Feature('oveHeaPumY_u') -x3.lag(2) # oveHeaPumY_u_lag1, oveHeaPumY_u_lag2 - - -""" -shift (Union[int, str, dict]): Time step of the input data used to predict the output. - - If a single int or str is given, it applies to all inputs. - - If a dict is provided, it can specify different shifts for individual inputs. - - If not all inputs are specified in the dict, unspecified inputs will use a default value (autocomplete). - Examples: - - shift = 0 or shift = 'current': Current time step will be used for prediction. - - shift = 1 or shift = 'previous': Previous values will be used for prediction. - - shift = 'mean_over_interval': Mean between current and previous time step will be used. 
- - shift = { - 'inp_1': 1, - 'inp_2': 'mean_over_interval', - '_default': 0, # current time step will be used for all inputs not specified in the dict - # If no custom default value is given in dict, 'previous' will be used as default - } -""" -shift = { - 'reaTZon_y': 'previous', # for all lags of reaTZon_y, the shift will be set automatically - 'weaSta_reaWeaHDirNor_y': 'mean_over_interval', - '_default': 0, -} - -# Create Training data -# Time step defines target sampling: if original sampling of data is in 15min intervals, it is resampled to 1h intervals for time_step=4 -# Hence, if the shift method of an input is defined as 'mean_over_interval', the mean over the last hour is taken as input -prep = PreprocessingSingleStep(inputs, output, shift=shift, time_step=4) - -# Process Training data -td = prep.pipeline(file_path) - -# Classical ANN -m = ClassicalANNModel(epochs=50) - -# Training pipeline -model = m.pipeline(td) - -# Log setup of preprocessing and model as json -Logger.log_setup(prep, m) -# Log training data as pickle -Logger.save_training_data(td) \ No newline at end of file diff --git a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py new file mode 100644 index 0000000..e292a6f --- /dev/null +++ b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py @@ -0,0 +1,78 @@ +from physXAI.models.ann.ann_design import ClassicalANNModel +from physXAI.preprocessing.preprocessing import PreprocessingSingleStep +from physXAI.preprocessing.constructed import Feature, FeatureConstruction, FeatureExp +from physXAI.utils.logging import Logger + +""" +This script demonstrates the usage of different sampling methods. It is not physically meaningful. + +When creating a Feature (or any subclass of FeatureBase like FeatureLag, FeatureAdd etc.), a sampling method can be +specified. 
+ +sampling_method (Union[str, int]): Time step of the input data used to predict the output. + - if None: FeatureConstruction.get_default_sampling_method() is used + - if 'current' or 0: Current time step will be used for prediction. + - if 'previous' or 1: Previous time step will be used for prediction. + - if 'mean_over_interval': Mean between current and previous time step will be used. + + Specify default sampling method using FeatureConstruction.set_default_sampling_method(). + If no default sampling method is specified by the user, 'previous' is used as default. +""" +FeatureConstruction.set_default_sampling_method(0) + +# Setup up logger for saving +Logger.setup_logger(folder_name='different_sampling_methods_ann', override=True) + +# File path to data +file_path = r"data/bestest_hydronic_heat_pump/pid_data.csv" + +# List of input features. Can include names of constructed features and lagged inputs +inputs = ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', + Feature('weaSta_reaWeaHDirNor_y', sampling_method='mean_over_interval'), 'oveHeaPumY_u', + 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] +# Output feature +output = 'Change(T_zone)' + +""" +The constructed features are automatically added to the data via 'physXAI.preprocessing.constructed.py' +Lagged inputs can be added directly based on the feature +""" +# create lags of reaTZon_y: reaTZon_y_lag1, reaTZon_y_lag2 +x1 = Feature('reaTZon_y', sampling_method='previous') +lx1 = x1.lag(2) # for all lags of reaTZon_y, the shift will be set automatically as 'previous' + +# create lag of weaSta_reaWeaTDryBul_y: weaSta_reaWeaTDryBul_y_lag1 +x2 = Feature('weaSta_reaWeaTDryBul_y') +lx2 = x2.lag(1) + +# create lag of oveHeaPumY_u: oveHeaPumY_u_lag1, oveHeaPumY_u_lag2 +x3 = Feature('oveHeaPumY_u') +x3.lag(2) + +# dummy Features +y = x1 + lx1[0] +z = y + x1 +z.rename('example_feature_two') +z.sampling_method = 'mean_over_interval' +e = FeatureExp(x1-273.15, 'exp', 
sampling_method=1) # reduce x1 by 273.15, otherwise values are too high + +# add dummy features to inputs +inputs.extend([z, e]) + +# Create Training data +# Time step defines target sampling: if original sampling of data is in 15min intervals, it is resampled to 1h intervals +# for time_step=4. Hence, if the shift method of an input is defined as 'mean_over_interval', the mean over the last +# hour is taken as input +prep = PreprocessingSingleStep(inputs, output, time_step=4) + +# Process Training data +td = prep.pipeline(file_path) + +# Build & train Classical ANN +m = ClassicalANNModel(epochs=50) +model = m.pipeline(td) + +# Log setup of preprocessing and model as json +Logger.log_setup(prep, m) +# Log training data as pickle +Logger.save_training_data(td) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 97b5310..fa6f82c 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -98,31 +98,17 @@ class PreprocessingData(ABC): Abstract Preprocessing Class """ - def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, list[str]], shift: Union[int, str, dict] = 'previous', + def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, list[str]], time_step: Optional[Union[int, float]] = None, test_size: float = 0.1, val_size: float = 0.1, random_state: int = 42, time_index_col: Union[str, float] = 0, csv_delimiter: str = ';', csv_encoding: str = 'latin1', - csv_header: int = 0, csv_skiprows: Union[int, list[int]] = [], ignore_nan: bool = False): + csv_header: int = 0, csv_skiprows: Union[int, list[int]] = [], ignore_nan: bool = False, **kwargs): """ Initializes the Preprocessing instance. Args: inputs (List[Union[str, FeatureBase]]): List of column names or Features to be used as input features. output (Union[str, List[str]]): Column name(s) for the target variable(s). 
- shift (Union[int, str, dict]): Time step of the input data used to predict the output. - - If a single int or str is given, it applies to all inputs. - - If a dict is provided, it can specify different shifts for individual inputs. - - If not all inputs are specified in the dict, unspecified inputs will use a default value (autocomplete). - Examples: - - shift = 0 or shift = 'current': Current time step will be used for prediction. - - shift = 1 or shift = 'previous': Previous values will be used for prediction. - - shift = 'mean_over_interval': Mean between current and previous time step will be used. - - shift = { - 'inp_1': 1, - 'inp_2': 'mean_over_interval', - '_default': 0, # current time step will be used for all inputs not specified in the dict - # If no custom default value is given in dict, 'previous' will be used as default - } time_step (Optional[Union[int, float]]): Optional time step sampling. If None, sampling of data is used. test_size (float): Proportion of the dataset to allocate to the test set. val_size (float): Proportion of the dataset to allocate to the validation set. @@ -145,13 +131,6 @@ def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, lis output = [output] self.output: list[str] = output - if isinstance(shift, dict) and '_default' in shift.keys(): - self.shift_default = shift['_default'] - shift.__delitem__('_default') - else: - self.shift_default = None - self.shift: dict = convert_shift_to_dict(shift, inputs, custom_default=self.shift_default) - self.time_step = time_step # Training, validation and test size should be equal to 1 @@ -230,7 +209,7 @@ class PreprocessingSingleStep(PreprocessingData): validation, and test sets. 
""" - def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, list[str]], shift: Union[int, str, dict] = 'previous', + def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, list[str]], time_step: Optional[Union[int, float]] = None, test_size: float = 0.1, val_size: float = 0.1, random_state: int = 42, time_index_col: Union[str, float] = 0, csv_delimiter: str = ';', csv_encoding: str = 'latin1', @@ -241,20 +220,6 @@ def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, lis Args: inputs (List[Union[str, FeatureBase]]): List of column names or Features to be used as input features. output (Union[str, List[str]]): Column name(s) for the target variable(s). - shift (Union[int, str, dict]): Time step of the input data used to predict the output. - - If a single int or str is given, it applies to all inputs. - - If a dict is provided, it can specify different shifts for individual inputs. - - If not all inputs are specified in the dict, unspecified inputs will use a default value (autocomplete). - Examples: - - shift = 0 or shift = 'current': Current time step will be used for prediction. - - shift = 1 or shift = 'previous': Previous values will be used for prediction. - - shift = 'mean_over_interval': Mean between current and previous time step will be used. - - shift = { - 'inp_1': 1, - 'inp_2': 'mean_over_interval', - '_default': 0, # current time step will be used for all inputs not specified in the dict - # If no custom default value is given in dict, 'previous' will be used as default - } time_step (Optional[Union[int, float]]): Optional time step sampling. If None, sampling of data is used. test_size (float): Proportion of the dataset to allocate to the test set. val_size (float): Proportion of the dataset to allocate to the validation set. 
@@ -267,8 +232,23 @@ def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, lis ignore_nan (bool): If True, rows with NaN values will be dropped. If False, an error is raised if NaNs are present. Default is False. """ - super().__init__(inputs, output, shift, time_step, test_size, val_size, random_state, time_index_col, - csv_delimiter, csv_encoding, csv_header, csv_skiprows, ignore_nan) + if 'shift' in kwargs.keys(): + DeprecationWarning( + "shift parameter is deprecated for SingleStep models and replaced by sampling_method, an attribute of " + "each Feature. This allows specifying individual 'shifts' for each Feature / input. A default sampling" + "method can be specified via FeatureConstruction.set_default_sampling_method()." + ) + DeprecationWarning( + f"shift parameter was given as shift={kwargs['shift']}. Setting FeatureConstruction.set_default_" + f"sampling_method(shift) and override possible individual sampling methods of all Features. If this is" + f"not intended, remove shift parameter when initializing PreprocessingSingleStep object!" 
+ ) + FeatureConstruction.set_default_sampling_method(kwargs['shift']) + for f in FeatureConstruction.features: + f.sampling_method = kwargs['shift'] + + super().__init__(inputs, output, time_step, test_size, val_size, random_state, time_index_col, + csv_delimiter, csv_encoding, csv_header, csv_skiprows, ignore_nan, **kwargs) def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ @@ -462,7 +442,6 @@ def get_config(self) -> dict: '__class_name__': self.__class__.__name__, 'inputs': self.inputs, 'output': self.output, - 'shift': self.shift, 'test_size': self.test_size, 'val_size': self.val_size, 'random_state': self.random_state, @@ -475,7 +454,7 @@ def from_config(cls, config: dict) -> 'PreprocessingSingleStep': return cls(**config) -class PreprocessingMultiStep (PreprocessingData): +class PreprocessingMultiStep(PreprocessingData): """ Handles preprocessing for multi-step forecasting models, typically RNNs. This involves creating windowed datasets suitable for sequence models, @@ -487,7 +466,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], label_width test_size: float = 0.1, val_size: float = 0.1, random_state: int = 42, time_index_col: Union[str, float] = 0, csv_delimiter: str = ';', csv_encoding: str = 'latin1', csv_header: int = 0, csv_skiprows: Union[int, list[int]] = [], - overlapping_sequences: bool = True, batch_size=32, init_features: list[str] = None,**kwargs): + overlapping_sequences: bool = True, batch_size=32, init_features: list[str] = None, **kwargs): """ Initializes the PreprocessingMultiStep instance. @@ -514,7 +493,7 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], label_width If None and warmup_width > 0, defaults to `inputs`. If None and warmup_width <= 0, defaults to empty list. 
""" - super().__init__(inputs, output, shift, time_step, test_size, val_size, random_state, time_index_col, + super().__init__(inputs, output, time_step, test_size, val_size, random_state, time_index_col, csv_delimiter, csv_encoding, csv_header, csv_skiprows) self.overlapping_sequences = overlapping_sequences diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index 54ba351..fcac12f 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -9,7 +9,7 @@ from physXAI.utils.logging import Logger, get_parent_working_directory from physXAI.preprocessing.preprocessing import PreprocessingSingleStep, PreprocessingMultiStep, \ PreprocessingData, convert_shift_to_dict -from physXAI.preprocessing.constructed import Feature, FeatureConstruction, FeatureConstant +from physXAI.preprocessing.constructed import Feature, FeatureConstruction, FeatureConstant, FeatureExp from physXAI.feature_selection.recursive_feature_elimination import recursive_feature_elimination_pipeline from physXAI.models.models import LinearRegressionModel, AbstractModel from physXAI.models.ann.ann_design import ClassicalANNModel, CMNNModel, LinANNModel, PINNModel, RNNModel, \ @@ -40,7 +40,8 @@ def inputs_tair(): @pytest.fixture(scope='module') def inputs_tair_extended(): return ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', - 'weaSta_reaWeaHDirNor_y', 'oveHeaPumY_u', 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] + Feature('weaSta_reaWeaHDirNor_y', sampling_method='mean_over_interval'), 'oveHeaPumY_u', + 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] @pytest.fixture(scope='module') def output_php(): @@ -232,32 +233,44 @@ def tair_data_total(file_path, inputs_tair, output_tair): td = prep.pipeline(file_path) return prep, td -def test_shifting(file_path, inputs_tair_extended, output_tair): +def test_sampling_methods(file_path, inputs_tair_extended, output_tair): # Setup up logger for saving 
Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) + FeatureConstruction.set_default_sampling_method(0) + # Create lags - x1 = Feature('reaTZon_y') - x1.lag(2) # reaTZon_y_lag1, reaTZon_y_lag2 + x1 = Feature('reaTZon_y', sampling_method='previous') + lx1 = x1.lag(2) # reaTZon_y_lag1, reaTZon_y_lag2 x2 = Feature('weaSta_reaWeaTDryBul_y') - x2.lag(1) # weaSta_reaWeaTDryBul_y_lag1 + lx2 = x2.lag(1) # weaSta_reaWeaTDryBul_y_lag1 x3 = Feature('oveHeaPumY_u') x3.lag(2) # oveHeaPumY_u_lag1, oveHeaPumY_u_lag2 - shift = { - 'reaTZon_y': 'previous', # for all lags of reaTZon_y, the shift will be set automatically - 'weaSta_reaWeaHDirNor_y': 'mean_over_interval', - '_default': 0, - } + # dummy Features + y = x1 + lx1[0] + z = y + x1 + z.rename('test_feature_two') + z.sampling_method = 'mean_over_interval' + e = FeatureExp(x1-273.15, 'exp', sampling_method=1) # reduce x1 by 273.15, otherwise values are too high + + inputs_tair_extended.extend([z, e]) # Create & process Training data - prep = PreprocessingSingleStep(inputs_tair_extended, output_tair, shift=shift, time_step=4) + prep = PreprocessingSingleStep(inputs_tair_extended, output_tair, time_step=4) td = prep.pipeline(file_path) # Build & train Classical ANN m = ClassicalANNModel(epochs=1) model = m.pipeline(td) + # check correct sampling_method specification + assert x1.sampling_method == 'previous' and lx1[1].sampling_method == 'previous' + assert x2.sampling_method == 'current' and lx2.sampling_method == 'current' + assert FeatureConstruction.get_feature('weaSta_reaWeaHDirNor_y').sampling_method == 'mean_over_interval' + assert FeatureConstruction.get_feature('test_feature_two').sampling_method == 'mean_over_interval' + assert e.sampling_method == 'previous' + def test_model_linReg(inputs_php, output_php, file_path): # Setup up logger for saving Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) From 
11d1719f759b31b02f4e1b8d5b54f683f89eac9e Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Sun, 7 Dec 2025 17:26:04 +0100 Subject: [PATCH 24/42] Fixed small mistake regarding DataFrame length --- physXAI/preprocessing/preprocessing.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index fa6f82c..923cb50 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -331,13 +331,13 @@ def pairwise(iterable: Iterable): else: # different inputs have different sampling methods res = [] - previous_or_mean_in_sampling_methods = False + previous_or_mean_in_sampling_methods = [] for f in features_without_lags: # only process inputs with sampling method mean_over_interval first since X cannot be sampled # to the actual required time steps until the intermediate values were taken into the mean if f.sampling_method == 'mean_over_interval': res.append(get_mean_over_interval(y, X[[f.feature]])) - previous_or_mean_in_sampling_methods = True + previous_or_mean_in_sampling_methods.append(True) # sample X according to required time step X = self.sample_df_according_to_timestep(X) @@ -347,25 +347,28 @@ def pairwise(iterable: Iterable): if f.sampling_method == 'current': # no transformation needed res.append(_x) + previous_or_mean_in_sampling_methods.append(False) elif f.sampling_method == 'previous': # shift by 1 _x = _x.shift(1) _x = _x.iloc[1:] res.append(_x) - previous_or_mean_in_sampling_methods = True + previous_or_mean_in_sampling_methods.append(True) elif f.sampling_method == 'mean_over_interval': continue else: raise NotImplementedError(f"Sampling method '{f.sampling_method}' not implemented.") X = pd.concat(res, axis=1) + X = X.sort_index(ascending=True) # Sampling methods 'previous' and 'mean_over_interval' reduce available data points by 1. 
# Therefore, lengths of X and y have to be synchronized - if previous_or_mean_in_sampling_methods: + if any(previous_or_mean_in_sampling_methods): y = y.iloc[1:] - X = X.sort_index(ascending=True) - X = X.iloc[1:] + # if at least one of the features uses 'current' as sampling method, shorten X + if not all(previous_or_mean_in_sampling_methods): + X = X.iloc[1:] res_df = pd.concat([X, y], axis=1) From 8d9af45a4a5bc5023f011849228c77220c9c7a46 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Sun, 7 Dec 2025 17:26:41 +0100 Subject: [PATCH 25/42] Fixed testing bug --- unittests/test_coverage.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index fcac12f..524ed73 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -85,6 +85,8 @@ def test_preprocessing(monkeypatch, file_path, inputs_php, output_php): prep = PreprocessingSingleStep(inputs_php, output_php) prep.pipeline(file_path) + FeatureConstruction.reset() + def test_preprocessing_multistep(file_path, inputs_tair, output_tair): Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -100,6 +102,8 @@ def test_preprocessing_multistep(file_path, inputs_tair, output_tair): overlapping_sequences=False, batch_size=1) prep.pipeline(file_path) + FeatureConstruction.reset() + class TestPreprocessingShiftConversion(TestCase): inputs = ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', @@ -271,6 +275,8 @@ def test_sampling_methods(file_path, inputs_tair_extended, output_tair): assert FeatureConstruction.get_feature('test_feature_two').sampling_method == 'mean_over_interval' assert e.sampling_method == 'previous' + FeatureConstruction.reset() + def test_model_linReg(inputs_php, output_php, file_path): # Setup up logger for saving Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) From 
8f8c377950cacba752d79663c5198c7bc485c5e5 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Sun, 7 Dec 2025 16:30:08 +0000 Subject: [PATCH 26/42] Update coverage badge [skip ci] --- build/reports/coverage.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/reports/coverage.svg b/build/reports/coverage.svg index 6963b3e..1c7007c 100644 --- a/build/reports/coverage.svg +++ b/build/reports/coverage.svg @@ -15,7 +15,7 @@ coverage coverage - 87% - 87% + 89% + 89% From c42f42404b7ceb4d929b75563df66dad0ab293ea Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 10 Dec 2025 13:20:28 +0100 Subject: [PATCH 27/42] Updated testing for sampling method as attribute of Feature deleted deprecated code and test for shift conversion --- physXAI/preprocessing/constructed.py | 5 + physXAI/preprocessing/preprocessing.py | 80 --------- unittests/test_coverage.py | 231 +++++++++++++------------ 3 files changed, 123 insertions(+), 193 deletions(-) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index a8c5da1..341f319 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -6,6 +6,10 @@ def _return_valid_sampling_method(v: Union[int, str]): """ check the validity of the given sampling method and return a string if val is int """ + + if not isinstance(v, (int, str)): + raise TypeError(f'Type of sampling method not supported. 
Type is {type(v)}, must be int or str.') + if v in ['current', 0]: return 'current' elif v in ['previous', 1]: @@ -604,6 +608,7 @@ def set_default_sampling_method(val: Union[str, int]): def reset(): """Clears all registered features and input names.""" FeatureConstruction.features = list[FeatureBase]() + FeatureConstruction.set_default_sampling_method('previous') @staticmethod def append(f: FeatureBase): diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 923cb50..1b8a5ff 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -13,86 +13,6 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' -def convert_shift_to_dict(s: Union[int, str, dict], inputs: list[str], custom_default: Union[int, str] = None) -> dict: - """ - Convert a given shift variable (int, str) into a dictionary in which a shift is defined for every input. - If a dictionary is given as shift, check entries and autocomplete dict if necessary. - - Args: - s (Union[int, str, dict]): Shift value. Either a single string or int which then will be applied to all the inputs or - a dictionary in which a different shift can be defined for each input. If the dictionary does not specify the - shift for all inputs, the shift for inputs not specified is set to the default value (autocomplete) - inputs (list(str)): List of Input variables - custom_default (Union[int, str]): if no custom default is specified, 'previous' is used as default shift - """ - - def return_valid_shift(val: Union[int, str]): - """ check the validity of the given shift and return a string if val is int """ - if val in ['current', 0]: - val = 'current' - elif val in ['previous', 1]: - val = 'previous' - elif val == 'mean_over_interval': - val = 'mean_over_interval' - else: - raise ValueError( - f"Value of shift not supported, value is: {val}. 
Shift must be 'current' (or 0 if s is int), " - f"'previous' (or 1 if s is int) or 'mean_over_interval'.") - return val - - # set custom default or - if no custom default is specified - use 'previous' as default - default = 'previous' if custom_default is None else return_valid_shift(custom_default) - - if isinstance(s, (int, str)): - d = {} - s = return_valid_shift(s) - - # add shift for each input - for inp in inputs: - d.update({inp: s}) - return d - - elif isinstance(s, dict): - def get_lag(inputs: list[str], current_input: str) -> int: - """ get lag of current input """ - count = 0 - for inp in inputs: - spl = inp.split(current_input) # make sure it is the current input - if spl[0] == '' and spl[1] != '' and spl[1].split('_lag')[0] == '': - count += 1 - return count - - # check if lags exist - d = {} - inputs_without_lags = {} - for inp in inputs: - # skip if current input is just the lag of another inp - if not inp.__contains__('_lag'): - inputs_without_lags.update({inp: get_lag(inputs, inp)}) - - for inp in inputs_without_lags.keys(): - # if an input has a shift assigned already, the validity is checked - # otherwise default value is assigned - if inp in s.keys(): - d.update({inp: return_valid_shift(s[inp])}) - else: - d.update({inp: default}) - - # all inputs with lags should have the same shift - if inputs_without_lags[inp] > 0: # if current input has lags - for i in range(inputs_without_lags[inp]): - name = inp + '_lag' + str(i+1) - - # if a shift was already defined for this lag, check if it matches the shift of the original inp - if name in s.keys(): - assert return_valid_shift(s[name]) == d[inp], \ - 'Make sure that all lags of an input have the same shift' - d.update({name: d[inp]}) - return d - else: - raise TypeError(f'shift must be of type int, str or dict, is type {type(s)}') - - class PreprocessingData(ABC): """ Abstract Preprocessing Class diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index 524ed73..cd7a910 100644 --- 
a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -8,14 +8,13 @@ ###################################################################################################################### from physXAI.utils.logging import Logger, get_parent_working_directory from physXAI.preprocessing.preprocessing import PreprocessingSingleStep, PreprocessingMultiStep, \ - PreprocessingData, convert_shift_to_dict -from physXAI.preprocessing.constructed import Feature, FeatureConstruction, FeatureConstant, FeatureExp + PreprocessingData +from physXAI.preprocessing.constructed import Feature, FeatureConstruction, FeatureConstant, FeatureExp, FeatureLag from physXAI.feature_selection.recursive_feature_elimination import recursive_feature_elimination_pipeline from physXAI.models.models import LinearRegressionModel, AbstractModel from physXAI.models.ann.ann_design import ClassicalANNModel, CMNNModel, LinANNModel, PINNModel, RNNModel, \ RBFModel - base_path = os.path.join(pathlib.Path(__file__).resolve().parent.parent, 'stored_data') @@ -25,18 +24,22 @@ def disable_plotly_show(): with patch('plotly.graph_objects.Figure.show'): yield + @pytest.fixture(scope='module') def file_path(): return os.path.join(pathlib.Path(__file__).resolve().parent.parent, "data/bestest_hydronic_heat_pump/pid_data.csv") + @pytest.fixture(scope='module') def inputs_php(): return ['oveHeaPumY_u', 'Func(logistic)', 'weaSta_reaWeaTDryBul_y', 'reaTZon_y'] + @pytest.fixture(scope='module') def inputs_tair(): return ['reaTZon_y', 'weaSta_reaWeaTDryBul_y', 'oveHeaPumY_u', 'oveHeaPumY_u_lag1'] + @pytest.fixture(scope='module') def inputs_tair_extended(): return ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', @@ -47,13 +50,16 @@ def inputs_tair_extended(): def output_php(): return 'reaPHeaPum_y' + @pytest.fixture(scope='module') def output_tair(): return 'Change(T_zone)' + def test_path_setup(): get_parent_working_directory() + def 
test_preprocessing(monkeypatch, file_path, inputs_php, output_php): monkeypatch.setattr('builtins.input', lambda _: "Y") @@ -87,6 +93,7 @@ def test_preprocessing(monkeypatch, file_path, inputs_php, output_php): FeatureConstruction.reset() + def test_preprocessing_multistep(file_path, inputs_tair, output_tair): Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -104,85 +111,112 @@ def test_preprocessing_multistep(file_path, inputs_tair, output_tair): FeatureConstruction.reset() -class TestPreprocessingShiftConversion(TestCase): - - inputs = ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', - 'weaSta_reaWeaHDirNor_y', 'oveHeaPumY_u', 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] - # test case: int given for shift - def test_int(self): - shift = 0 - res = convert_shift_to_dict(shift, self.inputs) - res_expected = {'reaTZon_y': 'current', 'reaTZon_y_lag1': 'current', 'reaTZon_y_lag2': 'current', - 'weaSta_reaWeaTDryBul_y': 'current', 'weaSta_reaWeaTDryBul_y_lag1': 'current', - 'weaSta_reaWeaHDirNor_y': 'current', 'oveHeaPumY_u': 'current', 'oveHeaPumY_u_lag1': 'current', - 'oveHeaPumY_u_lag2': 'current'} - assert res == res_expected +class TestSamplingMethodsFaults(TestCase): - # test case: unsupported int given for shift - def test_unsupported_int(self): - shift = 2 - with self.assertRaises(ValueError): - convert_shift_to_dict(shift, self.inputs) - - # test case: str given for shift - def test_str(self): - shift = 'mean_over_interval' - res = convert_shift_to_dict(shift, self.inputs) - res_expected = {'reaTZon_y': 'mean_over_interval', 'reaTZon_y_lag1': 'mean_over_interval', - 'reaTZon_y_lag2': 'mean_over_interval', 'weaSta_reaWeaTDryBul_y': 'mean_over_interval', - 'weaSta_reaWeaTDryBul_y_lag1': 'mean_over_interval', - 'weaSta_reaWeaHDirNor_y': 'mean_over_interval', 'oveHeaPumY_u': 'mean_over_interval', - 'oveHeaPumY_u_lag1': 'mean_over_interval', 'oveHeaPumY_u_lag2': 
'mean_over_interval'} - assert res == res_expected - - # test case: unsupported str given for shift + # test case: unsupported str given as sampling method def test_unsupported_str(self): - shift = 'test' with self.assertRaises(ValueError): - convert_shift_to_dict(shift, self.inputs) + FeatureConstruction.set_default_sampling_method('test') - # test case: unsupported type given for shift + # test case: unsupported type given for sampling method def test_unsupported_type(self): - shift = ['previous'] with self.assertRaises(TypeError): - convert_shift_to_dict(shift, self.inputs) - - # test case: autocomplete incomplete dictionary given for shift - def test_autocomplete_incomplete_dict(self): - shift = {'reaTZon_y': 0, 'reaTZon_y_lag1': 0, 'weaSta_reaWeaTDryBul_y': 'mean_over_interval'} - - # previous is default for all inputs that are not specified - res = convert_shift_to_dict(shift, self.inputs) - res_expected = {'reaTZon_y': 'current', 'reaTZon_y_lag1': 'current', 'reaTZon_y_lag2': 'current', - 'weaSta_reaWeaTDryBul_y': 'mean_over_interval', - 'weaSta_reaWeaTDryBul_y_lag1': 'mean_over_interval', - 'weaSta_reaWeaHDirNor_y': 'previous', 'oveHeaPumY_u': 'previous', - 'oveHeaPumY_u_lag1': 'previous', - 'oveHeaPumY_u_lag2': 'previous'} - assert len(res) == len(self.inputs) - assert res == res_expected - - # test case: autocomplete incomplete dictionary given for shift with custom default - def test_autocomplete_incomplete_dict_with_custom_default(self): - shift = {'reaTZon_y': 1, 'reaTZon_y_lag1': 1, 'weaSta_reaWeaTDryBul_y': 'mean_over_interval'} - - # previous is default for all inputs that are not specified - res = convert_shift_to_dict(shift, self.inputs, custom_default=0) - res_expected = {'reaTZon_y': 'previous', 'reaTZon_y_lag1': 'previous', 'reaTZon_y_lag2': 'previous', - 'weaSta_reaWeaTDryBul_y': 'mean_over_interval', - 'weaSta_reaWeaTDryBul_y_lag1': 'mean_over_interval', - 'weaSta_reaWeaHDirNor_y': 'current', 'oveHeaPumY_u': 'current', - 'oveHeaPumY_u_lag1': 
'current', - 'oveHeaPumY_u_lag2': 'current'} - assert len(res) == len(self.inputs) - assert res == res_expected - - # test case: lags of the same input have mismatching shifts + FeatureConstruction.set_default_sampling_method(['current']) + + # test case: lags of the same input have mismatching sampling methods def test_lag_with_mismatching_shifts(self): - shift = {'reaTZon_y': 0, 'reaTZon_y_lag1': 1, 'weaSta_reaWeaTDryBul_y': 'mean_over_interval'} + + x = Feature('test', sampling_method='current') with self.assertRaises(AssertionError): - convert_shift_to_dict(shift, self.inputs) + FeatureLag(x, lag=1, sampling_method='previous') + FeatureConstruction.reset() + + +def test_sampling_method_use_default(file_path, inputs_tair, output_tair): + """test case: use default sampling when no default is specified by user""" + + # when not overriding default sampling method, 'previous' is used + + x = Feature('oveHeaPumY_u') + x.lag(1) + + # Create & process Training data + prep = PreprocessingSingleStep(inputs_tair, output_tair) + td = prep.pipeline(file_path) + + assert len(inputs_tair) == len(FeatureConstruction.features) + + for inp in inputs_tair: + f = FeatureConstruction.get_feature(inp) + assert f.sampling_method == 'previous' + + FeatureConstruction.reset() + + +def test_sampling_method_str(file_path, inputs_tair, output_tair): + """test case: set default using str (setting default with int is done in test_different_sampling_methods)""" + + FeatureConstruction.set_default_sampling_method('mean_over_interval') + + x = Feature('oveHeaPumY_u') + x.lag(1) + + # Create & process Training data + prep = PreprocessingSingleStep(inputs_tair, output_tair, time_step=4) + td = prep.pipeline(file_path) + + assert len(inputs_tair) == len(FeatureConstruction.features) + + for inp in inputs_tair: + f = FeatureConstruction.get_feature(inp) + assert f.sampling_method == 'mean_over_interval' + + FeatureConstruction.reset() + + +def test_different_sampling_methods(file_path, 
inputs_tair_extended, output_tair): + """test case: different sampling methods given""" + + # Setup up logger for saving + Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) + + # set default + FeatureConstruction.set_default_sampling_method(0) + + # Create lags + x1 = Feature('reaTZon_y', sampling_method='previous') + lx1 = x1.lag(2) # reaTZon_y_lag1, reaTZon_y_lag2 + x2 = Feature('weaSta_reaWeaTDryBul_y') + lx2 = x2.lag(1) # weaSta_reaWeaTDryBul_y_lag1 + x3 = Feature('oveHeaPumY_u') + x3.lag(2) # oveHeaPumY_u_lag1, oveHeaPumY_u_lag2 + + # dummy Features + y = x1 + lx1[0] + z = y + x1 + z.rename('test_feature_two') + z.sampling_method = 'mean_over_interval' + e = FeatureExp(x1 - 273.15, 'exp', sampling_method=1) # reduce x1 by 273.15, otherwise values are too high + + inputs_tair_extended.extend([z, e]) + + # Create & process Training data + prep = PreprocessingSingleStep(inputs_tair_extended, output_tair, time_step=4) + td = prep.pipeline(file_path) + + # Build & train Classical ANN + m = ClassicalANNModel(epochs=1) + model = m.pipeline(td) + + # check correct sampling_method specification + assert x1.sampling_method == 'previous' and lx1[1].sampling_method == 'previous' + assert x2.sampling_method == 'current' and lx2.sampling_method == 'current' + assert FeatureConstruction.get_feature('weaSta_reaWeaHDirNor_y').sampling_method == 'mean_over_interval' + assert FeatureConstruction.get_feature('test_feature_two').sampling_method == 'mean_over_interval' + assert e.sampling_method == 'previous' + + FeatureConstruction.reset() @pytest.fixture(scope='module') @@ -194,6 +228,7 @@ def p_hp_data(file_path, inputs_php, output_php): td = prep.pipeline(file_path) return prep, td + @pytest.fixture(scope='module') def tair_data_delta(file_path, inputs_tair, output_tair): Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -208,6 +243,7 @@ def tair_data_delta(file_path, inputs_tair, 
output_tair): td = prep.pipeline(file_path) return prep, td + @pytest.fixture(scope='module') def tair_data_noval(file_path, inputs_tair, output_tair): Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -237,45 +273,6 @@ def tair_data_total(file_path, inputs_tair, output_tair): td = prep.pipeline(file_path) return prep, td -def test_sampling_methods(file_path, inputs_tair_extended, output_tair): - # Setup up logger for saving - Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) - - FeatureConstruction.set_default_sampling_method(0) - - # Create lags - x1 = Feature('reaTZon_y', sampling_method='previous') - lx1 = x1.lag(2) # reaTZon_y_lag1, reaTZon_y_lag2 - x2 = Feature('weaSta_reaWeaTDryBul_y') - lx2 = x2.lag(1) # weaSta_reaWeaTDryBul_y_lag1 - x3 = Feature('oveHeaPumY_u') - x3.lag(2) # oveHeaPumY_u_lag1, oveHeaPumY_u_lag2 - - # dummy Features - y = x1 + lx1[0] - z = y + x1 - z.rename('test_feature_two') - z.sampling_method = 'mean_over_interval' - e = FeatureExp(x1-273.15, 'exp', sampling_method=1) # reduce x1 by 273.15, otherwise values are too high - - inputs_tair_extended.extend([z, e]) - - # Create & process Training data - prep = PreprocessingSingleStep(inputs_tair_extended, output_tair, time_step=4) - td = prep.pipeline(file_path) - - # Build & train Classical ANN - m = ClassicalANNModel(epochs=1) - model = m.pipeline(td) - - # check correct sampling_method specification - assert x1.sampling_method == 'previous' and lx1[1].sampling_method == 'previous' - assert x2.sampling_method == 'current' and lx2.sampling_method == 'current' - assert FeatureConstruction.get_feature('weaSta_reaWeaHDirNor_y').sampling_method == 'mean_over_interval' - assert FeatureConstruction.get_feature('test_feature_two').sampling_method == 'mean_over_interval' - assert e.sampling_method == 'previous' - - FeatureConstruction.reset() def test_model_linReg(inputs_php, output_php, file_path): # 
Setup up logger for saving @@ -293,6 +290,7 @@ def test_model_linReg(inputs_php, output_php, file_path): Logger.log_setup(prep, m, save_name_model='model_linReg.json') Logger.save_training_data(td, path=os.path.join(Logger._logger, 'training_data2')) + def test_model_ann(p_hp_data, inputs_php, output_php, file_path): # Setup up logger for saving Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -311,6 +309,7 @@ def test_model_ann(p_hp_data, inputs_php, output_php, file_path): Logger.log_setup(None, m) Logger.save_training_data(td) + def test_model_cmnn(p_hp_data, inputs_php, output_php, file_path): # Setup up logger for saving Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -349,6 +348,7 @@ def test_model_cmnn(p_hp_data, inputs_php, output_php, file_path): Logger.log_setup(prep, m) Logger.save_training_data(td) + def test_model_linANN(p_hp_data, inputs_php, output_php, file_path): # Setup up logger for saving Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -374,6 +374,7 @@ def test_model_linANN(p_hp_data, inputs_php, output_php, file_path): Logger.log_setup(prep, m) Logger.save_training_data(td) + def test_model_pinn(inputs_php, output_php, file_path): # Setup up logger for saving Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -406,6 +407,7 @@ def test_model_pinn(inputs_php, output_php, file_path): Logger.log_setup(prep, m) Logger.save_training_data(td) + def test_models_rnn(file_path): Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -441,6 +443,7 @@ def test_models_rnn(file_path): m = RNNModel(epochs=1, rnn_layer='RNN', early_stopping_epochs=None) m.pipeline(td, save_model=False, plot=False) + def test_read_setup(): Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ 
-476,6 +479,7 @@ def test_read_setup(): config_model = json.load(f) AbstractModel.model_from_config(config_model) + def test_feature_selection(monkeypatch, p_hp_data, file_path): # Setup up logger for saving Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -491,6 +495,7 @@ def test_feature_selection(monkeypatch, p_hp_data, file_path): recursive_feature_elimination_pipeline(file_path, prep, m, ascending_lag_order=True, fixed_inputs=['weaSta_reaWeaTDryBul_y', 'oveHeaPumY_u']) + def test_feature_selection_multi(monkeypatch, tair_data_delta, tair_data_noval ,tair_data_total, file_path): # Setup up logger for saving Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) @@ -522,4 +527,4 @@ def test_feature_selection_multi(monkeypatch, tair_data_delta, tair_data_noval , recursive_feature_elimination_pipeline(file_path, prep2, m, use_multi_step_error=False) m.pipeline(td2, save_model=False, plot=False) Logger.log_setup(prep, None) - Logger.save_training_data(td, path=os.path.join(Logger._logger, 'training_data2.json')) \ No newline at end of file + Logger.save_training_data(td, path=os.path.join(Logger._logger, 'training_data2.json')) From 8323ee50ede6e6faf522cab692253944cb8bfa07 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 10 Dec 2025 17:15:39 +0100 Subject: [PATCH 28/42] Fixed error: char not allowed in folder name --- physXAI/utils/logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/physXAI/utils/logging.py b/physXAI/utils/logging.py index 7f873d7..61334b2 100644 --- a/physXAI/utils/logging.py +++ b/physXAI/utils/logging.py @@ -142,7 +142,7 @@ def setup_logger(folder_name: str = None, override: bool = False, base_path: str if base_path is None: base_path = Logger.base_path if folder_name is None: - folder_name = datetime.now().strftime("%d.%m.%y %H:%M:%S") + folder_name = datetime.now().strftime("%y-%m-%d %H.%M.%S") folder_name = 
os.path.join(base_path, folder_name) else: folder_name = os.path.join(base_path, folder_name) From 8fd0093e4da701f8f1524726b12a414b0847b522 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 10 Dec 2025 17:38:08 +0100 Subject: [PATCH 29/42] fixed bug with property & added testing for deprecated shift --- .../different_sampling_methods.py | 2 +- physXAI/preprocessing/constructed.py | 16 ++--- physXAI/preprocessing/preprocessing.py | 18 ++--- unittests/test_coverage.py | 66 +++++++++++++++---- 4 files changed, 72 insertions(+), 30 deletions(-) diff --git a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py index e292a6f..26c3f07 100644 --- a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py +++ b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py @@ -53,7 +53,7 @@ y = x1 + lx1[0] z = y + x1 z.rename('example_feature_two') -z.sampling_method = 'mean_over_interval' +z.set_sampling_method('mean_over_interval') e = FeatureExp(x1-273.15, 'exp', sampling_method=1) # reduce x1 by 273.15, otherwise values are too high # add dummy features to inputs diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index 341f319..4ae2e9b 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -44,17 +44,17 @@ def __init__(self, name: str, sampling_method: Union[str, int] = None, **kwargs) """ self.feature: str = name - self.sampling_method = sampling_method + self._sampling_method = None + self.set_sampling_method(sampling_method) # Automatically registers the newly created feature instance with the FeatureConstruction manager FeatureConstruction.append(self) - @property - def sampling_method(self): + def get_sampling_method(self) -> str: + """returns the Features sampling method""" return self._sampling_method - @sampling_method.setter - def sampling_method(self, val: Union[str, int] = 
None): + def set_sampling_method(self, val: Union[str, int] = None): """ Sets the feature's sampling method. If None is given, FeatureConstruction._default_sampling_method is used Available methods: @@ -162,7 +162,7 @@ def get_config(self) -> dict: return { 'class_name': self.__class__.__name__, 'name': self.feature, - 'sampling_method': self.sampling_method, + 'sampling_method': self.get_sampling_method(), } @classmethod @@ -240,14 +240,14 @@ def __init__(self, f: Union[FeatureBase, str], lag: int, name: str = None, **kwa name = f.feature + f'_lag{lag}' # lags must have the same sampling_method as their base feature - sampling_method = f.sampling_method + sampling_method = f.get_sampling_method() else: self.origf: str = f if name is None: name = f + f'_lag{lag}' # lags must have the same sampling_method as their base feature - sampling_method = FeatureConstruction.get_feature(f).sampling_method + sampling_method = FeatureConstruction.get_feature(f).get_sampling_method() if 'sampling_method' in kwargs.keys(): assert kwargs['sampling_method'] == sampling_method, ( diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 1b8a5ff..d45bee8 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -165,7 +165,7 @@ def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, lis ) FeatureConstruction.set_default_sampling_method(kwargs['shift']) for f in FeatureConstruction.features: - f.sampling_method = kwargs['shift'] + f.set_sampling_method(kwargs['shift']) super().__init__(inputs, output, time_step, test_size, val_size, random_state, time_index_col, csv_delimiter, csv_encoding, csv_header, csv_skiprows, ignore_nan, **kwargs) @@ -232,11 +232,11 @@ def pairwise(iterable: Iterable): X = df[inputs_without_lags].copy() - if all('current' == f.sampling_method for f in features_without_lags): + if all('current' == f.get_sampling_method() for f in features_without_lags): # 
filter / sample data X = self.sample_df_according_to_timestep(X) # nothing more to do here - elif all('previous' == f.sampling_method for f in features_without_lags): + elif all('previous' == f.get_sampling_method() for f in features_without_lags): # filter / sample data X = self.sample_df_according_to_timestep(X) @@ -244,7 +244,7 @@ def pairwise(iterable: Iterable): X = X.shift(1) y = y.iloc[1:] X = X.iloc[1:] - elif all('mean_over_interval' == f.sampling_method for f in features_without_lags): + elif all('mean_over_interval' == f.get_sampling_method() for f in features_without_lags): X = get_mean_over_interval(y, X) # synchronize length between X and y y = y.iloc[1:] @@ -255,7 +255,7 @@ def pairwise(iterable: Iterable): for f in features_without_lags: # only process inputs with sampling method mean_over_interval first since X cannot be sampled # to the actual required time steps until the intermediate values were taken into the mean - if f.sampling_method == 'mean_over_interval': + if f.get_sampling_method() == 'mean_over_interval': res.append(get_mean_over_interval(y, X[[f.feature]])) previous_or_mean_in_sampling_methods.append(True) @@ -264,20 +264,20 @@ def pairwise(iterable: Iterable): # process inputs with sampling methods 'current' and 'previous' for f in features_without_lags: _x = X[[f.feature]] - if f.sampling_method == 'current': + if f.get_sampling_method() == 'current': # no transformation needed res.append(_x) previous_or_mean_in_sampling_methods.append(False) - elif f.sampling_method == 'previous': + elif f.get_sampling_method() == 'previous': # shift by 1 _x = _x.shift(1) _x = _x.iloc[1:] res.append(_x) previous_or_mean_in_sampling_methods.append(True) - elif f.sampling_method == 'mean_over_interval': + elif f.get_sampling_method() == 'mean_over_interval': continue else: - raise NotImplementedError(f"Sampling method '{f.sampling_method}' not implemented.") + raise NotImplementedError(f"Sampling method '{f.get_sampling_method()}' not implemented.") 
X = pd.concat(res, axis=1) X = X.sort_index(ascending=True) diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index cd7a910..48e6133 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -125,7 +125,7 @@ def test_unsupported_type(self): FeatureConstruction.set_default_sampling_method(['current']) # test case: lags of the same input have mismatching sampling methods - def test_lag_with_mismatching_shifts(self): + def test_lag_with_mismatching_sampling_methods(self): x = Feature('test', sampling_method='current') with self.assertRaises(AssertionError): @@ -149,7 +149,7 @@ def test_sampling_method_use_default(file_path, inputs_tair, output_tair): for inp in inputs_tair: f = FeatureConstruction.get_feature(inp) - assert f.sampling_method == 'previous' + assert f.get_sampling_method() == 'previous' FeatureConstruction.reset() @@ -170,7 +170,7 @@ def test_sampling_method_str(file_path, inputs_tair, output_tair): for inp in inputs_tair: f = FeatureConstruction.get_feature(inp) - assert f.sampling_method == 'mean_over_interval' + assert f.get_sampling_method() == 'mean_over_interval' FeatureConstruction.reset() @@ -178,9 +178,6 @@ def test_sampling_method_str(file_path, inputs_tair, output_tair): def test_different_sampling_methods(file_path, inputs_tair_extended, output_tair): """test case: different sampling methods given""" - # Setup up logger for saving - Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) - # set default FeatureConstruction.set_default_sampling_method(0) @@ -196,7 +193,7 @@ def test_different_sampling_methods(file_path, inputs_tair_extended, output_tair y = x1 + lx1[0] z = y + x1 z.rename('test_feature_two') - z.sampling_method = 'mean_over_interval' + z.set_sampling_method('mean_over_interval') e = FeatureExp(x1 - 273.15, 'exp', sampling_method=1) # reduce x1 by 273.15, otherwise values are too high inputs_tair_extended.extend([z, e]) @@ -210,11 +207,11 @@ def 
test_different_sampling_methods(file_path, inputs_tair_extended, output_tair model = m.pipeline(td) # check correct sampling_method specification - assert x1.sampling_method == 'previous' and lx1[1].sampling_method == 'previous' - assert x2.sampling_method == 'current' and lx2.sampling_method == 'current' - assert FeatureConstruction.get_feature('weaSta_reaWeaHDirNor_y').sampling_method == 'mean_over_interval' - assert FeatureConstruction.get_feature('test_feature_two').sampling_method == 'mean_over_interval' - assert e.sampling_method == 'previous' + assert x1.get_sampling_method() == 'previous' and lx1[1].get_sampling_method() == 'previous' + assert x2.get_sampling_method() == 'current' and lx2.get_sampling_method() == 'current' + assert FeatureConstruction.get_feature('weaSta_reaWeaHDirNor_y').get_sampling_method() == 'mean_over_interval' + assert FeatureConstruction.get_feature('test_feature_two').get_sampling_method() == 'mean_over_interval' + assert e.get_sampling_method() == 'previous' FeatureConstruction.reset() @@ -310,6 +307,51 @@ def test_model_ann(p_hp_data, inputs_php, output_php, file_path): Logger.save_training_data(td) +def test_deprecated_shift(p_hp_data, inputs_php, output_php, file_path): + + # Setup up logger for saving + Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) + + # Create & process Training data + prep = PreprocessingSingleStep(inputs_php, output_php, shift=0) # deprecated shift given in preprocessing + td = prep.pipeline(file_path) + + m = ClassicalANNModel(epochs=1, n_neurons=[4, 4], n_layers=2, activation_function=['softplus', 'softplus'], + early_stopping_epochs=None, rescale_output=False) + m.pipeline(td) + + m.epochs = 1 + m.online_pipeline(td, os.path.join(Logger._logger, 'model.keras')) + + assert FeatureConstruction.get_default_sampling_method() == 'current' + FeatureConstruction.set_default_sampling_method('previous') # reset default sampling + + # from config + config_prep = { 
+ "__class_name__": "PreprocessingSingleStep", + "inputs": [ + "oveHeaPumY_u", + "Func(logistic)", + "weaSta_reaWeaTDryBul_y", + "reaTZon_y" + ], + "output": [ + "reaPHeaPum_y" + ], + "shift": 0, # deprecated shift + "test_size": 0.1, + "val_size": 0.1, + "random_state": 42, + "time_step": 1.0, + } + + a = PreprocessingData.from_config(config_prep) + assert isinstance(a, PreprocessingSingleStep) + assert FeatureConstruction.get_default_sampling_method() == 'current' + + FeatureConstruction.reset() + + def test_model_cmnn(p_hp_data, inputs_php, output_php, file_path): # Setup up logger for saving Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) From 049676192254fe1ff792ad6f8c9d0951e2ea06eb Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 10 Dec 2025 17:39:23 +0100 Subject: [PATCH 30/42] Fixed small error in testing script --- physXAI/preprocessing/preprocessing.py | 13 ++++++++++++- unittests/test_coverage.py | 8 +++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index d45bee8..4a484d7 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -118,7 +118,18 @@ def get_config(self) -> dict: @classmethod @abstractmethod def from_config(cls, config: dict) -> 'PreprocessingData': - pass + + if "__class_name__" in config.keys(): + if config["__class_name__"] == 'PreprocessingSingleStep': + return PreprocessingSingleStep.from_config(config) + elif config["__class_name__"] == 'PreprocessingMultiStep': + return PreprocessingMultiStep.from_config(config) + else: + raise ValueError( + f"config does not contain a valid '__class_name__'. config['__class_name__'] is " + f"{config["__class_name__"]} but only 'PreprocessingSingleStep' or 'PreprocessingMultiStep' allowed.") + else: + raise ValueError("No valid config given. 
config does not contain key '__class_name__'") class PreprocessingSingleStep(PreprocessingData): diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index 48e6133..7a5b99c 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -462,7 +462,7 @@ def test_models_rnn(file_path): m = RNNModel(epochs=1, rnn_layer='LSTM', init_layer='dense') m.pipeline(td, os.path.join(Logger._logger, 'model2.keras')) - Logger.log_setup(td, m, 'preprocessing_config2.json', + Logger.log_setup(prep, m, 'preprocessing_config2.json', save_name_constructed='constructed_config2.json') Logger.save_training_data(td) @@ -495,13 +495,15 @@ def test_read_setup(): path = os.path.join(Logger._logger, save_name_preprocessing) with open(path, "r") as f: config_prep = json.load(f) - PreprocessingData.from_config(config_prep) + a = PreprocessingData.from_config(config_prep) + assert isinstance(a, PreprocessingSingleStep) save_name_preprocessing = 'preprocessing_config2.json' path = os.path.join(Logger._logger, save_name_preprocessing) with open(path, "r") as f: config_prep = json.load(f) - PreprocessingData.from_config(config_prep) + b = PreprocessingData.from_config(config_prep) + assert isinstance(b, PreprocessingMultiStep) save_name_constructed = Logger.save_name_constructed path = os.path.join(Logger._logger, save_name_constructed) From 03335f6ca94976ae9fed1f44bcb3e423f80e96ea Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 10 Dec 2025 20:19:23 +0100 Subject: [PATCH 31/42] fixed small syntax error with older python versions --- physXAI/preprocessing/preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 4a484d7..9a39e38 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -120,14 +120,14 @@ def get_config(self) -> dict: def from_config(cls, config: dict) -> 'PreprocessingData': if 
"__class_name__" in config.keys(): - if config["__class_name__"] == 'PreprocessingSingleStep': + if config['__class_name__'] == 'PreprocessingSingleStep': return PreprocessingSingleStep.from_config(config) - elif config["__class_name__"] == 'PreprocessingMultiStep': + elif config['__class_name__'] == 'PreprocessingMultiStep': return PreprocessingMultiStep.from_config(config) else: raise ValueError( f"config does not contain a valid '__class_name__'. config['__class_name__'] is " - f"{config["__class_name__"]} but only 'PreprocessingSingleStep' or 'PreprocessingMultiStep' allowed.") + f"{config['__class_name__']} but only 'PreprocessingSingleStep' or 'PreprocessingMultiStep' allowed.") else: raise ValueError("No valid config given. config does not contain key '__class_name__'") From 9450d0fe57882c6eac463f8a28b0681c5b26f0f4 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 10 Dec 2025 20:49:24 +0100 Subject: [PATCH 32/42] Moved default_sampling_method from FeatureConstruction to Feature --- .../different_sampling_methods.py | 8 ++--- physXAI/preprocessing/constructed.py | 31 +++++++++++++------ physXAI/preprocessing/preprocessing.py | 10 +++--- unittests/test_coverage.py | 14 ++++----- 4 files changed, 37 insertions(+), 26 deletions(-) diff --git a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py index 26c3f07..84c5afa 100644 --- a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py +++ b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py @@ -1,6 +1,6 @@ from physXAI.models.ann.ann_design import ClassicalANNModel from physXAI.preprocessing.preprocessing import PreprocessingSingleStep -from physXAI.preprocessing.constructed import Feature, FeatureConstruction, FeatureExp +from physXAI.preprocessing.constructed import Feature, FeatureExp from physXAI.utils.logging import Logger """ @@ -10,15 +10,15 @@ specified. 
sampling_method (Union[str, int]): Time step of the input data used to predict the output. - - if None: FeatureConstruction.get_default_sampling_method() is used + - if None: Feature.get_default_sampling_method() is used - if 'current' or 0: Current time step will be used for prediction. - if 'previous' or 1: Previous time step will be used for prediction. - if 'mean_over_interval': Mean between current and previous time step will be used. - Specify default sampling method using FeatureConstruction.set_default_sampling_method(). + Specify default sampling method using Feature.set_default_sampling_method(). If no default sampling method is specified by the user, 'previous' is used as default. """ -FeatureConstruction.set_default_sampling_method(0) +Feature.set_default_sampling_method(0) # Setup up logger for saving Logger.setup_logger(folder_name='different_sampling_methods_ann', override=True) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index 4ae2e9b..2683acf 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -36,7 +36,7 @@ def __init__(self, name: str, sampling_method: Union[str, int] = None, **kwargs) Args: name (str): The name of the feature. This will be the column name in the DataFrame. sampling_method (Union[str, int]): Time step of the input data used to predict the output. - - if None: FeatureConstruction._default_sampling_method is used + - if None: Feature._default_sampling_method is used - if 'current' or 0: Current time step will be used for prediction. - if 'previous' or 1: Previous time step will be used for prediction. - if 'mean_over_interval': Mean between current and previous time step will be used. @@ -56,7 +56,7 @@ def get_sampling_method(self) -> str: def set_sampling_method(self, val: Union[str, int] = None): """ - Sets the feature's sampling method. 
If None is given, FeatureConstruction._default_sampling_method is used + Sets the feature's sampling method. If None is given, Feature._default_sampling_method is used Available methods: - 'current' or 0: Current time step will be used for prediction. - 'previous' or 1: Previous time step will be used for prediction. @@ -64,7 +64,7 @@ def set_sampling_method(self, val: Union[str, int] = None): """ if val is None: - self._sampling_method = FeatureConstruction.get_default_sampling_method() + self._sampling_method = Feature.get_default_sampling_method() else: self._sampling_method = _return_valid_sampling_method(val) @@ -213,7 +213,22 @@ class Feature(FeatureBase): Represents a basic feature that is assumed to exist directly in the input DataFrame. Its `process` method simply retrieves the column by its name. """ - pass + + _default_sampling_method = 'previous' + + @classmethod + def get_default_sampling_method(cls): + return Feature._default_sampling_method + + @classmethod + def set_default_sampling_method(cls, val: Union[str, int]): + """ + Sets the default sampling method for all features that do not have a custom sampling method. Available methods: + - 'current' or 0: Current time step will be used for prediction. + - 'previous' or 1: Previous time step will be used for prediction. + - 'mean_over_interval': Mean between current and previous time step will be used. + """ + Feature._default_sampling_method = _return_valid_sampling_method(val) @register_feature @@ -590,10 +605,6 @@ class FeatureConstruction: features = list[FeatureBase]() _default_sampling_method = 'previous' - @staticmethod - def get_default_sampling_method(): - return FeatureConstruction._default_sampling_method - @staticmethod def set_default_sampling_method(val: Union[str, int]): """ @@ -606,9 +617,9 @@ def set_default_sampling_method(val: Union[str, int]): @staticmethod def reset(): - """Clears all registered features and input names.""" + """Clears all registered features and input names. 
Furthermore, resets the default sampling method""" FeatureConstruction.features = list[FeatureBase]() - FeatureConstruction.set_default_sampling_method('previous') + Feature.set_default_sampling_method('previous') @staticmethod def append(f: FeatureBase): diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 9a39e38..49f8dbe 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -5,7 +5,7 @@ import pandas as pd import itertools from sklearn.model_selection import train_test_split -from physXAI.preprocessing.constructed import FeatureConstruction, FeatureBase +from physXAI.preprocessing.constructed import FeatureConstruction, FeatureBase, Feature from physXAI.preprocessing.training_data import TrainingData, TrainingDataMultiStep, TrainingDataGeneric from physXAI.utils.logging import get_full_path os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' @@ -167,14 +167,14 @@ def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, lis DeprecationWarning( "shift parameter is deprecated for SingleStep models and replaced by sampling_method, an attribute of " "each Feature. This allows specifying individual 'shifts' for each Feature / input. A default sampling" - "method can be specified via FeatureConstruction.set_default_sampling_method()." + "method can be specified via Feature.set_default_sampling_method()." ) DeprecationWarning( - f"shift parameter was given as shift={kwargs['shift']}. Setting FeatureConstruction.set_default_" - f"sampling_method(shift) and override possible individual sampling methods of all Features. If this is" + f"shift parameter was given as shift={kwargs['shift']}. Setting Feature.set_default_sampling_method" + f"(shift) and overriding possible individual sampling methods of all Features. If this is" f"not intended, remove shift parameter when initializing PreprocessingSingleStep object!" 
) - FeatureConstruction.set_default_sampling_method(kwargs['shift']) + Feature.set_default_sampling_method(kwargs['shift']) for f in FeatureConstruction.features: f.set_sampling_method(kwargs['shift']) diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index 7a5b99c..3c0f074 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -117,12 +117,12 @@ class TestSamplingMethodsFaults(TestCase): # test case: unsupported str given as sampling method def test_unsupported_str(self): with self.assertRaises(ValueError): - FeatureConstruction.set_default_sampling_method('test') + Feature.set_default_sampling_method('test') # test case: unsupported type given for sampling method def test_unsupported_type(self): with self.assertRaises(TypeError): - FeatureConstruction.set_default_sampling_method(['current']) + Feature.set_default_sampling_method(['current']) # test case: lags of the same input have mismatching sampling methods def test_lag_with_mismatching_sampling_methods(self): @@ -157,7 +157,7 @@ def test_sampling_method_use_default(file_path, inputs_tair, output_tair): def test_sampling_method_str(file_path, inputs_tair, output_tair): """test case: set default using str (setting default with int is done in test_different_sampling_methods)""" - FeatureConstruction.set_default_sampling_method('mean_over_interval') + Feature.set_default_sampling_method('mean_over_interval') x = Feature('oveHeaPumY_u') x.lag(1) @@ -179,7 +179,7 @@ def test_different_sampling_methods(file_path, inputs_tair_extended, output_tair """test case: different sampling methods given""" # set default - FeatureConstruction.set_default_sampling_method(0) + Feature.set_default_sampling_method(0) # Create lags x1 = Feature('reaTZon_y', sampling_method='previous') @@ -323,8 +323,8 @@ def test_deprecated_shift(p_hp_data, inputs_php, output_php, file_path): m.epochs = 1 m.online_pipeline(td, os.path.join(Logger._logger, 'model.keras')) - assert 
FeatureConstruction.get_default_sampling_method() == 'current' - FeatureConstruction.set_default_sampling_method('previous') # reset default sampling + assert Feature.get_default_sampling_method() == 'current' + Feature.set_default_sampling_method('previous') # reset default sampling # from config config_prep = { @@ -347,7 +347,7 @@ def test_deprecated_shift(p_hp_data, inputs_php, output_php, file_path): a = PreprocessingData.from_config(config_prep) assert isinstance(a, PreprocessingSingleStep) - assert FeatureConstruction.get_default_sampling_method() == 'current' + assert Feature.get_default_sampling_method() == 'current' FeatureConstruction.reset() From 93068a00beac8b9fb5225e2399c3f9f94da63a61 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Thu, 11 Dec 2025 16:42:31 +0100 Subject: [PATCH 33/42] Deleted deprecated code --- physXAI/preprocessing/constructed.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index 2683acf..a3045ed 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -603,17 +603,6 @@ class FeatureConstruction: """ features = list[FeatureBase]() - _default_sampling_method = 'previous' - - @staticmethod - def set_default_sampling_method(val: Union[str, int]): - """ - Sets the default sampling method for all features that do not have a custom sampling method. Available methods: - - 'current' or 0: Current time step will be used for prediction. - - 'previous' or 1: Previous time step will be used for prediction. - - 'mean_over_interval': Mean between current and previous time step will be used. 
- """ - FeatureConstruction._default_sampling_method = _return_valid_sampling_method(val) @staticmethod def reset(): From 096a863276ee350acc6ee71ceab3da2679f31ebb Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 17 Dec 2025 15:58:59 +0100 Subject: [PATCH 34/42] Implemented handling of constructed outputs --- .../different_sampling_methods.py | 6 +- physXAI/preprocessing/constructed.py | 40 +++- physXAI/preprocessing/preprocessing.py | 217 ++++++++++-------- unittests/test_coverage.py | 16 +- 4 files changed, 163 insertions(+), 116 deletions(-) diff --git a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py index 84c5afa..5ff4676 100644 --- a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py +++ b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py @@ -31,7 +31,7 @@ Feature('weaSta_reaWeaHDirNor_y', sampling_method='mean_over_interval'), 'oveHeaPumY_u', 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] # Output feature -output = 'Change(T_zone)' +output = ['Change(t_air)'] """ The constructed features are automatically added to the data via 'physXAI.preprocessing.constructed.py' @@ -53,9 +53,11 @@ y = x1 + lx1[0] z = y + x1 z.rename('example_feature_two') -z.set_sampling_method('mean_over_interval') e = FeatureExp(x1-273.15, 'exp', sampling_method=1) # reduce x1 by 273.15, otherwise values are too high +change_tair = x1 - lx1[0] +change_tair.rename('Change(t_air)') + # add dummy features to inputs inputs.extend([z, e]) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index a3045ed..eb2a45e 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -14,12 +14,13 @@ def _return_valid_sampling_method(v: Union[int, str]): return 'current' elif v in ['previous', 1]: return 'previous' - elif v == 'mean_over_interval': - return 'mean_over_interval' + elif v in 
['mean_over_interval', '_']: + return v else: raise ValueError( f"Value of sampling method not supported, value is: {v}. Sampling method must be 'current' " - f"(or 0 if s is int), 'previous' (or 1 if s is int) or 'mean_over_interval'.") + f"(or 0 if sampling_method is int), 'previous' (or 1 if sampling_method is int) or 'mean_over_interval'. " + f"In case of deactivated sampling (for outputs), sampling_method must be '_'.") class FeatureBase(ABC): @@ -680,12 +681,39 @@ def recursive_search(feature): return res @staticmethod - def process_inputs(inputs: list[Union[str, FeatureBase]]) -> list[str]: + def get_constructed_features(l: list[str] = None) -> list[str]: + """ + returns a list of the names of all constructed features (features that have a type other than 'Feature') + - within the given list or + - of all constructed features if list is None + + Args: + l (list[str]): list of feature names to search in + + Returns: + list[str]: the list of the names of the constructed features + """ + + # if no list is given, search in all features + if not l: + l = FeatureConstruction.features + + res = list() + for f in FeatureConstruction.features: + if not isinstance(f, Feature) and (f.feature in l): + res.append(f.feature) # name of the feature + + return res + + @staticmethod + def create_features(inputs: list[Union[str, FeatureBase]], no_sampling_method: bool = False) -> list[str]: """ Creates a Feature for all inputs that are not yet created as features Args: inputs (list(Union[str, FeatureBase])): List of column names or Features to be used as input features. + no_sampling_method (bool): deactivate sampling_method for outputs, default = False. 
+ If deactivated, sampling_method will be set to '_' Returns: list[str]: list of column names of all input features @@ -696,11 +724,15 @@ def process_inputs(inputs: list[Union[str, FeatureBase]]) -> list[str]: for inp in inputs: if isinstance(inp, FeatureBase): input_str.append(inp.feature) # get name of feature (which is used as column name) + if no_sampling_method: + inp.set_sampling_method('_') elif isinstance(inp, str): input_str.append(inp) # check if a Feature with the given name (inp) was already created, otherwise create it if not any(inp == f.feature for f in FeatureConstruction.features): Feature(name=inp) + if no_sampling_method: + FeatureConstruction.get_feature(inp).set_sampling_method('_') else: raise TypeError(f"Only inputs with types 'str' or 'FeatureBase' allowed, got type {type(inp)} instead") diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 49f8dbe..592bb0f 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -5,7 +5,7 @@ import pandas as pd import itertools from sklearn.model_selection import train_test_split -from physXAI.preprocessing.constructed import FeatureConstruction, FeatureBase, Feature +from physXAI.preprocessing.constructed import FeatureConstruction, FeatureBase, Feature, FeatureTwo from physXAI.preprocessing.training_data import TrainingData, TrainingDataMultiStep, TrainingDataGeneric from physXAI.utils.logging import get_full_path os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' @@ -46,10 +46,11 @@ def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, lis self.csv_header = csv_header self.csv_skiprows = csv_skiprows - self.inputs: list[str] = FeatureConstruction.process_inputs(inputs) + self.inputs: list[str] = FeatureConstruction.create_features(inputs) if isinstance(output, str): output = [output] - self.output: list[str] = output + # outputs shouldn't have any sampling method + self.output: list[str] = 
FeatureConstruction.create_features(output, no_sampling_method=True) self.time_step = time_step @@ -197,18 +198,18 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: and target (y) DataFrames. """ - # extract the names of all features in inputs and outputs that are based on lagged features - lag_based_features = FeatureConstruction.get_features_including_lagged_features(self.inputs + self.output) + # extract the names of all constructed features + constructed_input_features = FeatureConstruction.get_constructed_features(self.inputs) + constructed_output_features = FeatureConstruction.get_constructed_features(self.output) - inputs_without_lags = [inp for inp in self.inputs if inp not in lag_based_features] + # Only apply sampling method to those features which are not constructed features + # but which data is taken directly from the data frame + inputs_without_constructed = [inp for inp in self.inputs if inp not in constructed_input_features] + output_without_constructed = [out for out in self.output if out not in constructed_output_features] - # Applies feature constructions defined in `FeatureConstruction`. 
- # Only apply for those features that are not lags since lags must be constructed after sampling the data - # according to the given time step - FeatureConstruction.process(df, feature_names=inputs_without_lags + [out for out in self.output if out not in inputs_without_lags]) - features_without_lags: list[FeatureBase] = [FeatureConstruction.get_feature(inp) for inp in inputs_without_lags] + features_without_constructed: list[FeatureBase] = [FeatureConstruction.get_feature(inp) for inp in inputs_without_constructed] - df = df[inputs_without_lags + [out for out in self.output if out not in inputs_without_lags]] + df = df[inputs_without_constructed + output_without_constructed] # Nan handling in first and last rows non_nan_rows = df.notna().all(axis=1) @@ -216,103 +217,93 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: last_valid_index = non_nan_rows.iloc[::-1].idxmax() if non_nan_rows.any() else None df = df.loc[first_valid_index:last_valid_index] - def get_mean_over_interval(y: pd.DataFrame, x: pd.DataFrame): - """return mean values of x on target sampling (index of y)""" - def pairwise(iterable: Iterable): - "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
- a, b = itertools.tee(iterable) - next(b, None) - return zip(a, b) - - original_grid = np.array(x.index) - results = [] - for i, j in pairwise(y.index): # output interval is target grid - slicer = np.logical_and(original_grid >= i, original_grid < j) - d = {'Index': j} - for inp in x.columns: - d[inp] = x[inp][slicer].mean() - results.append(d) - - x = pd.DataFrame(results).set_index('Index') - - return x - - # output is independent of sampling of inputs -> sample according to time step already - y = df[self.output].copy() - y = self.sample_df_according_to_timestep(y) - - X = df[inputs_without_lags].copy() - - if all('current' == f.get_sampling_method() for f in features_without_lags): - # filter / sample data - X = self.sample_df_according_to_timestep(X) - # nothing more to do here - elif all('previous' == f.get_sampling_method() for f in features_without_lags): - # filter / sample data - X = self.sample_df_according_to_timestep(X) - - # shift data by 1 and shorten DataFrames accordingly - X = X.shift(1) - y = y.iloc[1:] - X = X.iloc[1:] - elif all('mean_over_interval' == f.get_sampling_method() for f in features_without_lags): - X = get_mean_over_interval(y, X) - # synchronize length between X and y - y = y.iloc[1:] - - else: # different inputs have different sampling methods - res = [] - previous_or_mean_in_sampling_methods = [] - for f in features_without_lags: - # only process inputs with sampling method mean_over_interval first since X cannot be sampled - # to the actual required time steps until the intermediate values were taken into the mean - if f.get_sampling_method() == 'mean_over_interval': - res.append(get_mean_over_interval(y, X[[f.feature]])) - previous_or_mean_in_sampling_methods.append(True) - - # sample X according to required time step - X = self.sample_df_according_to_timestep(X) - # process inputs with sampling methods 'current' and 'previous' - for f in features_without_lags: - _x = X[[f.feature]] - if f.get_sampling_method() == 'current': - 
# no transformation needed - res.append(_x) - previous_or_mean_in_sampling_methods.append(False) - elif f.get_sampling_method() == 'previous': - # shift by 1 - _x = _x.shift(1) - _x = _x.iloc[1:] - res.append(_x) - previous_or_mean_in_sampling_methods.append(True) - elif f.get_sampling_method() == 'mean_over_interval': - continue - else: - raise NotImplementedError(f"Sampling method '{f.get_sampling_method()}' not implemented.") - - X = pd.concat(res, axis=1) - X = X.sort_index(ascending=True) + # sample input data; different inputs can have different sampling methods + res = [] + previous_or_mean_in_sampling_methods = [] + X = df[inputs_without_constructed].copy() + target_grid = self.sample_df_according_to_timestep(df).index + for f in features_without_constructed: + # only process inputs with sampling method mean_over_interval first since X cannot be sampled + # to the actual required time steps until the intermediate values were taken into the mean + if f.get_sampling_method() == 'mean_over_interval': + res.append(get_mean_over_interval(X[[f.feature]], target_grid)) + previous_or_mean_in_sampling_methods.append(True) + + # sample X according to required time step + X = self.sample_df_according_to_timestep(X) + # process inputs with sampling methods 'current' and 'previous' + for f in features_without_constructed: + _x = X[[f.feature]] + if f.get_sampling_method() == 'current': + # no transformation needed + res.append(_x) + previous_or_mean_in_sampling_methods.append(False) + elif f.get_sampling_method() == 'previous': + # shift by 1 + _x = _x.shift(1) + _x = _x.iloc[1:] + res.append(_x) + previous_or_mean_in_sampling_methods.append(True) + elif f.get_sampling_method() == 'mean_over_interval': + continue + else: + raise NotImplementedError(f"Sampling method '{f.get_sampling_method()}' not implemented.") + # concatenate sampled input data + X = pd.concat(res, axis=1) + X = X.sort_index(ascending=True) + + # Sampling methods 'previous' and 'mean_over_interval' 
reduce available data points by 1. + if any(previous_or_mean_in_sampling_methods): + # if at least one of the features uses 'current' as sampling method, shorten X + if not all(previous_or_mean_in_sampling_methods): + X = X.iloc[1:] + + if X.isnull().values.any(): + if self.ignore_nan: + X.dropna(inplace=True) + else: + raise ValueError( + "Data Error: The TrainingData contains NaN values in intermediate rows. If this is intended, set " + "ignore_nan=True in PreprocessingSingleStep.") + # sample output data + if len(output_without_constructed) != 0: # at least one non-constructed output feature + y = df[output_without_constructed].copy() + y = self.sample_df_according_to_timestep(y) # Sampling methods 'previous' and 'mean_over_interval' reduce available data points by 1. - # Therefore, lengths of X and y have to be synchronized + # synchronize length of X and y if any(previous_or_mean_in_sampling_methods): y = y.iloc[1:] - # if at least one of the features uses 'current' as sampling method, shorten X - if not all(previous_or_mean_in_sampling_methods): - X = X.iloc[1:] + if y.isnull().values.any(): + if self.ignore_nan: + y.dropna(inplace=True) + else: + raise ValueError( + "Data Error: The TrainingData contains NaN values in intermediate rows. If this is intended," + "set ignore_nan=True in PreprocessingSingleStep.") - res_df = pd.concat([X, y], axis=1) + res_df = pd.concat([X, y], axis=1) - if res_df.isnull().values.any(): - if self.ignore_nan: - res_df.dropna(inplace=True) - else: - raise ValueError( - "Data Error: The TrainingData contains NaN values in intermediate rows. 
If this is intended, set " - "ignore_nan=True in PreprocessingSingleStep.") + else: # only constructed outputs + res_df = X + + # Applies feature constructions defined in `FeatureConstruction` + FeatureConstruction.process(res_df, feature_names=constructed_input_features + constructed_output_features) + + # assume constructed outputs solely base on features with sampling current or sampling previous / mean_over_interval - # Applies feature constructions defined in `FeatureConstruction` to the lagged inputs - FeatureConstruction.process(res_df, feature_names=lag_based_features) + if any(previous_or_mean_in_sampling_methods): + methods = ['previous', 'mean_over_interval'] + # if constructed output features are based on input features with sampling previous or mean_over_interval, + # the constructed output has to be shifted to invert the shift of the input features + for out in constructed_output_features: + out_feature = FeatureConstruction.get_feature(out) + if isinstance(out_feature, FeatureTwo): + if out_feature.feature1.get_sampling_method() in methods or out_feature.feature2.get_sampling_method() in methods: + res_df[out_feature.feature] = res_df[out_feature.feature].shift(-1) # shift + else: # constructed feature that doesn't consist of two features + if out_feature.f1.get_sampling_method() in methods: + res_df[out_feature.feature] = res_df[out_feature.feature].shift(-1) # shift # drop NaNs occurring due to creation of lags res_df.dropna(inplace=True) @@ -388,6 +379,28 @@ def from_config(cls, config: dict) -> 'PreprocessingSingleStep': return cls(**config) +def get_mean_over_interval(x: pd.DataFrame, target_grid: pd.DataFrame.index) -> pd.DataFrame: + """samples and returns x on target grid taking the mean over the interval (between the grid indices)""" + def pairwise(iterable: Iterable): + "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
+ a, b = itertools.tee(iterable) + next(b, None) + return zip(a, b) + + original_grid = np.array(x.index) + results = [] + for i, j in pairwise(target_grid): + slicer = np.logical_and(original_grid >= i, original_grid < j) + d = {'Index': j} + for inp in x.columns: + d[inp] = x[inp][slicer].mean() + results.append(d) + + x = pd.DataFrame(results).set_index('Index') + + return x + + class PreprocessingMultiStep(PreprocessingData): """ Handles preprocessing for multi-step forecasting models, typically RNNs. diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index 3c0f074..444c1a2 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -145,8 +145,6 @@ def test_sampling_method_use_default(file_path, inputs_tair, output_tair): prep = PreprocessingSingleStep(inputs_tair, output_tair) td = prep.pipeline(file_path) - assert len(inputs_tair) == len(FeatureConstruction.features) - for inp in inputs_tair: f = FeatureConstruction.get_feature(inp) assert f.get_sampling_method() == 'previous' @@ -166,8 +164,6 @@ def test_sampling_method_str(file_path, inputs_tair, output_tair): prep = PreprocessingSingleStep(inputs_tair, output_tair, time_step=4) td = prep.pipeline(file_path) - assert len(inputs_tair) == len(FeatureConstruction.features) - for inp in inputs_tair: f = FeatureConstruction.get_feature(inp) assert f.get_sampling_method() == 'mean_over_interval' @@ -175,7 +171,7 @@ def test_sampling_method_str(file_path, inputs_tair, output_tair): FeatureConstruction.reset() -def test_different_sampling_methods(file_path, inputs_tair_extended, output_tair): +def test_different_sampling_methods(file_path, inputs_tair_extended): """test case: different sampling methods given""" # set default @@ -193,13 +189,16 @@ def test_different_sampling_methods(file_path, inputs_tair_extended, output_tair y = x1 + lx1[0] z = y + x1 z.rename('test_feature_two') - z.set_sampling_method('mean_over_interval') e = FeatureExp(x1 - 273.15, 'exp', sampling_method=1) # 
reduce x1 by 273.15, otherwise values are too high inputs_tair_extended.extend([z, e]) + # output + change_tair = x1 - lx1[0] + change_tair.rename('Change(t_air)') + # Create & process Training data - prep = PreprocessingSingleStep(inputs_tair_extended, output_tair, time_step=4) + prep = PreprocessingSingleStep(inputs_tair_extended, [change_tair], time_step=4) td = prep.pipeline(file_path) # Build & train Classical ANN @@ -210,8 +209,9 @@ def test_different_sampling_methods(file_path, inputs_tair_extended, output_tair assert x1.get_sampling_method() == 'previous' and lx1[1].get_sampling_method() == 'previous' assert x2.get_sampling_method() == 'current' and lx2.get_sampling_method() == 'current' assert FeatureConstruction.get_feature('weaSta_reaWeaHDirNor_y').get_sampling_method() == 'mean_over_interval' - assert FeatureConstruction.get_feature('test_feature_two').get_sampling_method() == 'mean_over_interval' + assert FeatureConstruction.get_feature('test_feature_two').get_sampling_method() == 'current' assert e.get_sampling_method() == 'previous' + assert change_tair.get_sampling_method() == '_' FeatureConstruction.reset() From 203c601e69ddab5e2a98a476b07ea143040775ae Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 17 Dec 2025 15:05:27 +0000 Subject: [PATCH 35/42] Update coverage badge [skip ci] --- build/reports/coverage.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/reports/coverage.svg b/build/reports/coverage.svg index 1c7007c..b3e8ba0 100644 --- a/build/reports/coverage.svg +++ b/build/reports/coverage.svg @@ -15,7 +15,7 @@ coverage coverage - 89% - 89% + 88% + 88% From bafeb3cb9812dd9bb87b9f4b0eb7fe4a399d0f8e Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Thu, 18 Dec 2025 00:22:29 +0100 Subject: [PATCH 36/42] restructured sampling_method sampling_method of constructed features determined based on corresponding base_features(s) --- .../different_sampling_methods.py | 17 ++-- physXAI/preprocessing/constructed.py | 85 
+++++++++++++++---- unittests/test_coverage.py | 12 ++- 3 files changed, 84 insertions(+), 30 deletions(-) diff --git a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py index 5ff4676..4048da2 100644 --- a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py +++ b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py @@ -6,8 +6,8 @@ """ This script demonstrates the usage of different sampling methods. It is not physically meaningful. -When creating a Feature (or any subclass of FeatureBase like FeatureLag, FeatureAdd etc.), a sampling method can be -specified. +When creating a Feature, a sampling method can be specified. +For constructed features, no sampling method is necessary. It is assigned based on their corresponding base feature(s) sampling_method (Union[str, int]): Time step of the input data used to predict the output. - if None: Feature.get_default_sampling_method() is used @@ -30,8 +30,8 @@ inputs = ['reaTZon_y', 'reaTZon_y_lag1', 'reaTZon_y_lag2', 'weaSta_reaWeaTDryBul_y', 'weaSta_reaWeaTDryBul_y_lag1', Feature('weaSta_reaWeaHDirNor_y', sampling_method='mean_over_interval'), 'oveHeaPumY_u', 'oveHeaPumY_u_lag1', 'oveHeaPumY_u_lag2'] -# Output feature -output = ['Change(t_air)'] +# Output feature. 
Can include names of constructed features as well +output = ['Change(T_air)'] """ The constructed features are automatically added to the data via 'physXAI.preprocessing.constructed.py' @@ -52,14 +52,13 @@ # dummy Features y = x1 + lx1[0] z = y + x1 -z.rename('example_feature_two') +z.rename('example_feature_two') # since z is a constructed feature based on x1, its sampling_method will be previous e = FeatureExp(x1-273.15, 'exp', sampling_method=1) # reduce x1 by 273.15, otherwise values are too high +inputs.extend([z, e]) # add dummy features to inputs +# construct output change_tair = x1 - lx1[0] -change_tair.rename('Change(t_air)') - -# add dummy features to inputs -inputs.extend([z, e]) +change_tair.rename('Change(T_air)') # Create Training data # Time step defines target sampling: if original sampling of data is in 15min intervals, it is resampled to 1h intervals diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index eb2a45e..69c1ed5 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -204,6 +204,8 @@ def feature_from_config(item_conf: dict) -> 'FeatureBase': """ class_name = item_conf['class_name'] feature_class = CONSTRUCTED_CLASS_REGISTRY[class_name] + if 'sampling_method' in item_conf.keys() and item_conf['sampling_method'] == '_': + item_conf['ignore_sampling_for_output'] = True f1f = feature_class.from_config(item_conf) return f1f @@ -232,6 +234,52 @@ def set_default_sampling_method(cls, val: Union[str, int]): Feature._default_sampling_method = _return_valid_sampling_method(val) +def get_sampling_from_base(base_features: Union[FeatureBase, list[FeatureBase]], **kwargs) -> [str, list]: + """ + Returns the appropriate sampling_method for a constructed feature based on its base feature(s) + + Args: + base_features (Union[FeatureBase, list[FeatureBase]]): single base feature or list of max. two base features + **kwargs: additional keyword arguments. 
If sampling_method is given in kwargs as well, its validity is checked + + Returns: + sampling_method (str): sampling method + kwargs: kwargs which does not contain the key 'sampling_method' (anymore) + """ + + if not isinstance(base_features, list): + base_features = [base_features] + + assert len(base_features) <= 2, f'Expected a maximum of two features, got {len(base_features)} instead' + + sampling = [] + for f in base_features: + if isinstance(f, FeatureBase): + sampling.append(f.get_sampling_method()) + elif isinstance(f, (int, float)): # FeatureTwo can be built with int or float values + continue + else: + raise ValueError(f"Expected type [FeatureBase, int, float], got type {type(f)} instead") + + if len(sampling) > 1: + assert len(set(sampling)) == 1, f'Sampling methods of base feature are not equal, got {sampling}' + + sampling_method = sampling[0] + + if 'sampling_method' in kwargs.keys(): + if 'ignore_sampling_for_output' in kwargs.keys() and kwargs['ignore_sampling_for_output']: + # necessary for feature construction from config + sampling_method = '_' + else: + assert _return_valid_sampling_method(kwargs['sampling_method']) == sampling_method, ( + f"Constructed features must have the same sampling method as their base feature(s). Sampling method of " + f"base feature(s) is {sampling_method} but {kwargs['sampling_method']} was given as sampling method." 
+ ) + kwargs.__delitem__('sampling_method') # constructor must not get more than one arg with the same key + + return sampling_method, kwargs + + @register_feature class FeatureLag(FeatureBase): """ @@ -254,23 +302,13 @@ def __init__(self, f: Union[FeatureBase, str], lag: int, name: str = None, **kwa self.origf: str = f.feature if name is None: name = f.feature + f'_lag{lag}' - - # lags must have the same sampling_method as their base feature - sampling_method = f.get_sampling_method() else: self.origf: str = f if name is None: name = f + f'_lag{lag}' - # lags must have the same sampling_method as their base feature - sampling_method = FeatureConstruction.get_feature(f).get_sampling_method() - - if 'sampling_method' in kwargs.keys(): - assert kwargs['sampling_method'] == sampling_method, ( - f"lags must have the same sampling method as their base feature. Sampling method of base feature is" - f" {sampling_method} but for lag {kwargs['sampling_method']} was given as sampling method." - ) - kwargs.__delitem__('sampling_method') # constructor must not get more than one arg with the same key + # lags must have the same sampling_method as their base feature + sampling_method, kwargs = get_sampling_from_base(FeatureConstruction.get_feature(self.origf), **kwargs) super().__init__(name, sampling_method=sampling_method, **kwargs) self.lag: int = lag @@ -292,8 +330,8 @@ class FeatureTwo(FeatureBase, ABC): Examples: FeatureAdd (f1 + f2), FeatureSub (f1 - f2). """ - def __init__(self, feature1: Union[FeatureBase, int, float], feature2: Union[FeatureBase, int, float], name: str = None, - **kwargs): + def __init__(self, feature1: Union[FeatureBase, int, float], feature2: Union[FeatureBase, int, float], + name: str = None, **kwargs): """ Initializes a FeatureTwo instance. 
@@ -315,7 +353,10 @@ def __init__(self, feature1: Union[FeatureBase, int, float], feature2: Union[Fea f2n = str(feature2) if name is None: name = self.name(f1n, f2n) - super().__init__(name, **kwargs) + + # constructed features must have the same sampling_method as their base features + sampling_method, kwargs = get_sampling_from_base([feature1, feature2], **kwargs) + super().__init__(name, sampling_method=sampling_method, **kwargs) self.feature1 = feature1 self.feature2 = feature2 @@ -493,7 +534,9 @@ def __init__(self, f1: FeatureBase, name: str = None, **kwargs): self.f1: FeatureBase = f1 if name is None: name = 'exp(' + f1.feature + ')' - super().__init__(name, **kwargs) + # constructed features must have the same sampling_method as their base features + sampling_method, kwargs = get_sampling_from_base(f1, **kwargs) + super().__init__(name, sampling_method=sampling_method, **kwargs) def process(self, df: DataFrame) -> Series: if self.feature not in df.columns: @@ -523,7 +566,9 @@ def __init__(self, f1: FeatureBase, name: str = None, **kwargs): self.f1: FeatureBase = f1 if name is None: name = 'sin(' + f1.feature + ')' - super().__init__(name, **kwargs) + # constructed features must have the same sampling_method as their base features + sampling_method, kwargs = get_sampling_from_base(f1, **kwargs) + super().__init__(name, sampling_method=sampling_method, **kwargs) def process(self, df: DataFrame) -> Series: if self.feature not in df.columns: @@ -553,7 +598,9 @@ def __init__(self, f1: FeatureBase, name: str = None, **kwargs): self.f1: FeatureBase = f1 if name is None: name = 'cos(' + f1.feature + ')' - super().__init__(name, **kwargs) + # constructed features must have the same sampling_method as their base features + sampling_method, kwargs = get_sampling_from_base(f1, **kwargs) + super().__init__(name, sampling_method=sampling_method, **kwargs) def process(self, df: DataFrame) -> Series: if self.feature not in df.columns: @@ -583,6 +630,8 @@ class 
FeatureConstant(FeatureBase): def __init__(self, c: float, name: str, **kwargs): self.c = c + if 'sampling_method' in kwargs.keys(): + UserWarning(f"Using 'sampling_method' for {self.__class__} does not have any effect.") super().__init__(name, **kwargs) def process(self, df: DataFrame) -> Series: diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index 444c1a2..eab2736 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -195,7 +195,7 @@ def test_different_sampling_methods(file_path, inputs_tair_extended): # output change_tair = x1 - lx1[0] - change_tair.rename('Change(t_air)') + change_tair.rename('Change(T_air)') # Create & process Training data prep = PreprocessingSingleStep(inputs_tair_extended, [change_tair], time_step=4) @@ -209,7 +209,7 @@ def test_different_sampling_methods(file_path, inputs_tair_extended): assert x1.get_sampling_method() == 'previous' and lx1[1].get_sampling_method() == 'previous' assert x2.get_sampling_method() == 'current' and lx2.get_sampling_method() == 'current' assert FeatureConstruction.get_feature('weaSta_reaWeaHDirNor_y').get_sampling_method() == 'mean_over_interval' - assert FeatureConstruction.get_feature('test_feature_two').get_sampling_method() == 'current' + assert FeatureConstruction.get_feature('test_feature_two').get_sampling_method() == 'previous' assert e.get_sampling_method() == 'previous' assert change_tair.get_sampling_method() == '_' @@ -485,6 +485,8 @@ def test_models_rnn(file_path): m = RNNModel(epochs=1, rnn_layer='RNN', early_stopping_epochs=None) m.pipeline(td, save_model=False, plot=False) + FeatureConstruction.reset() + def test_read_setup(): @@ -497,6 +499,7 @@ def test_read_setup(): config_prep = json.load(f) a = PreprocessingData.from_config(config_prep) assert isinstance(a, PreprocessingSingleStep) + FeatureConstruction.reset() save_name_preprocessing = 'preprocessing_config2.json' path = os.path.join(Logger._logger, save_name_preprocessing) @@ -504,12 +507,14 @@ 
def test_read_setup(): config_prep = json.load(f) b = PreprocessingData.from_config(config_prep) assert isinstance(b, PreprocessingMultiStep) + FeatureConstruction.reset() save_name_constructed = Logger.save_name_constructed path = os.path.join(Logger._logger, save_name_constructed) with open(path, "r") as f: config_constructed = json.load(f) FeatureConstruction.from_config(config_constructed) + FeatureConstruction.reset() save_name_model = Logger.save_name_model_config path = os.path.join(Logger._logger, save_name_model) @@ -570,5 +575,6 @@ def test_feature_selection_multi(monkeypatch, tair_data_delta, tair_data_noval , m = ClassicalANNModel(epochs=1, n_neurons=4) recursive_feature_elimination_pipeline(file_path, prep2, m, use_multi_step_error=False) m.pipeline(td2, save_model=False, plot=False) - Logger.log_setup(prep, None) + Logger.log_setup(prep, None, save_name_preprocessing='preprocessing_feature-selection-multi.json', + save_name_constructed='constructed_config_feature-selection-multi.json') Logger.save_training_data(td, path=os.path.join(Logger._logger, 'training_data2.json')) From 79db1dc81ab9af4cb47aa31e49d5aaa279df0a74 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Thu, 18 Dec 2025 00:58:02 +0100 Subject: [PATCH 37/42] fixed testing bug resetting FeatureConstruction.features also affected p_hp_data --- unittests/test_coverage.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index eab2736..5980116 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -529,12 +529,13 @@ def test_read_setup(): AbstractModel.model_from_config(config_model) -def test_feature_selection(monkeypatch, p_hp_data, file_path): +def test_feature_selection(monkeypatch, inputs_php, output_php, file_path): # Setup up logger for saving Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) monkeypatch.setattr('builtins.input', lambda _: "2") 
- prep = p_hp_data[0] + # Create Training data + prep = PreprocessingSingleStep(inputs_php, output_php) m = LinearRegressionModel() From c2f002a9af94e61f256c37b65ae572fbdf52f588 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Thu, 18 Dec 2025 17:33:43 +0100 Subject: [PATCH 38/42] Refactoring of sampling, corrected use of UserWarnings --- physXAI/preprocessing/constructed.py | 36 ++-- physXAI/preprocessing/preprocessing.py | 166 ++++--------------- physXAI/preprocessing/sampling.py | 218 +++++++++++++++++++++++++ unittests/test_coverage.py | 9 +- 4 files changed, 268 insertions(+), 161 deletions(-) create mode 100644 physXAI/preprocessing/sampling.py diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index 69c1ed5..08e9616 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -2,25 +2,8 @@ from typing import Type, Union import numpy as np from pandas import DataFrame, Series - - -def _return_valid_sampling_method(v: Union[int, str]): - """ check the validity of the given sampling method and return a string if val is int """ - - if not isinstance(v, (int, str)): - raise TypeError(f'Type of sampling method not supported. Type is {type(v)}, must be int or str.') - - if v in ['current', 0]: - return 'current' - elif v in ['previous', 1]: - return 'previous' - elif v in ['mean_over_interval', '_']: - return v - else: - raise ValueError( - f"Value of sampling method not supported, value is: {v}. Sampling method must be 'current' " - f"(or 0 if sampling_method is int), 'previous' (or 1 if sampling_method is int) or 'mean_over_interval'. 
" - f"In case of deactivated sampling (for outputs), sampling_method must be '_'.") +import warnings +from physXAI.preprocessing.sampling import _return_valid_sampling_method class FeatureBase(ABC): @@ -234,7 +217,7 @@ def set_default_sampling_method(cls, val: Union[str, int]): Feature._default_sampling_method = _return_valid_sampling_method(val) -def get_sampling_from_base(base_features: Union[FeatureBase, list[FeatureBase]], **kwargs) -> [str, list]: +def get_sampling_from_base_feature(base_features: Union[FeatureBase, list[FeatureBase]], **kwargs) -> [str, list]: """ Returns the appropriate sampling_method for a constructed feature based on its base feature(s) @@ -308,7 +291,7 @@ def __init__(self, f: Union[FeatureBase, str], lag: int, name: str = None, **kwa name = f + f'_lag{lag}' # lags must have the same sampling_method as their base feature - sampling_method, kwargs = get_sampling_from_base(FeatureConstruction.get_feature(self.origf), **kwargs) + sampling_method, kwargs = get_sampling_from_base_feature(FeatureConstruction.get_feature(self.origf), **kwargs) super().__init__(name, sampling_method=sampling_method, **kwargs) self.lag: int = lag @@ -355,7 +338,7 @@ def __init__(self, feature1: Union[FeatureBase, int, float], feature2: Union[Fea name = self.name(f1n, f2n) # constructed features must have the same sampling_method as their base features - sampling_method, kwargs = get_sampling_from_base([feature1, feature2], **kwargs) + sampling_method, kwargs = get_sampling_from_base_feature([feature1, feature2], **kwargs) super().__init__(name, sampling_method=sampling_method, **kwargs) self.feature1 = feature1 self.feature2 = feature2 @@ -535,7 +518,7 @@ def __init__(self, f1: FeatureBase, name: str = None, **kwargs): if name is None: name = 'exp(' + f1.feature + ')' # constructed features must have the same sampling_method as their base features - sampling_method, kwargs = get_sampling_from_base(f1, **kwargs) + sampling_method, kwargs = 
get_sampling_from_base_feature(f1, **kwargs) super().__init__(name, sampling_method=sampling_method, **kwargs) def process(self, df: DataFrame) -> Series: @@ -567,7 +550,7 @@ def __init__(self, f1: FeatureBase, name: str = None, **kwargs): if name is None: name = 'sin(' + f1.feature + ')' # constructed features must have the same sampling_method as their base features - sampling_method, kwargs = get_sampling_from_base(f1, **kwargs) + sampling_method, kwargs = get_sampling_from_base_feature(f1, **kwargs) super().__init__(name, sampling_method=sampling_method, **kwargs) def process(self, df: DataFrame) -> Series: @@ -599,7 +582,7 @@ def __init__(self, f1: FeatureBase, name: str = None, **kwargs): if name is None: name = 'cos(' + f1.feature + ')' # constructed features must have the same sampling_method as their base features - sampling_method, kwargs = get_sampling_from_base(f1, **kwargs) + sampling_method, kwargs = get_sampling_from_base_feature(f1, **kwargs) super().__init__(name, sampling_method=sampling_method, **kwargs) def process(self, df: DataFrame) -> Series: @@ -631,7 +614,8 @@ class FeatureConstant(FeatureBase): def __init__(self, c: float, name: str, **kwargs): self.c = c if 'sampling_method' in kwargs.keys(): - UserWarning(f"Using 'sampling_method' for {self.__class__} does not have any effect.") + warnings.warn(f"Using 'sampling_method' for {self.__class__.__name__} does not have any effect.", + UserWarning) super().__init__(name, **kwargs) def process(self, df: DataFrame) -> Series: diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 592bb0f..958ea0c 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -1,12 +1,13 @@ import os from abc import ABC, abstractmethod -from typing import Optional, Union, Iterable +from typing import Optional, Union import numpy as np import pandas as pd -import itertools +import warnings from sklearn.model_selection import 
train_test_split from physXAI.preprocessing.constructed import FeatureConstruction, FeatureBase, Feature, FeatureTwo from physXAI.preprocessing.training_data import TrainingData, TrainingDataMultiStep, TrainingDataGeneric +from physXAI.preprocessing.sampling import Sampling from physXAI.utils.logging import get_full_path os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' import keras @@ -94,11 +95,6 @@ def load_data(self, file_path: str) -> pd.DataFrame: return df - def sample_df_according_to_timestep(self, df: pd.DataFrame): - filtering = (df.index - df.index[0]) % self.time_step == 0 - df = df[filtering] - return df - @abstractmethod def pipeline(self, file_path: str) -> TrainingDataGeneric: """ @@ -165,16 +161,14 @@ def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, lis """ if 'shift' in kwargs.keys(): - DeprecationWarning( - "shift parameter is deprecated for SingleStep models and replaced by sampling_method, an attribute of " - "each Feature. This allows specifying individual 'shifts' for each Feature / input. A default sampling" - "method can be specified via Feature.set_default_sampling_method()." - ) - DeprecationWarning( - f"shift parameter was given as shift={kwargs['shift']}. Setting Feature.set_default_sampling_method" - f"(shift) and overriding possible individual sampling methods of all Features. If this is" - f"not intended, remove shift parameter when initializing PreprocessingSingleStep object!" - ) + warnings.warn("shift parameter is deprecated for SingleStep models and replaced by sampling_method," + "an attribute of each Feature. This allows specifying individual 'shifts' for each Feature / " + "input. A default sampling method can be specified via " + "Feature.set_default_sampling_method().", DeprecationWarning) + warnings.warn(f"shift parameter was given as shift={kwargs['shift']}. Setting" + f"Feature.set_default_sampling_method(shift) and overriding possible individual sampling " + f"methods of all Features. 
If this is not intended, remove shift parameter when initializing" + f" PreprocessingSingleStep object!", DeprecationWarning) Feature.set_default_sampling_method(kwargs['shift']) for f in FeatureConstruction.features: f.set_sampling_method(kwargs['shift']) @@ -185,10 +179,11 @@ def __init__(self, inputs: list[Union[str, FeatureBase]], output: Union[str, lis def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ Processes the loaded DataFrame: - 1. Applies feature constructions defined in `FeatureConstruction`. - 2. Selects relevant input and output columns. - 3. Handles missing values by dropping rows. - 4. Applies the defined sampling method on each input variable. + + 1. Selects relevant input and output columns. + 2. Handles missing values by dropping rows. + 3. Applies the defined sampling method on each (unconstructed) input variable. + 4. Applies feature constructions defined in `FeatureConstruction`. Args: df (pd.DataFrame): The input DataFrame. @@ -199,15 +194,13 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: """ # extract the names of all constructed features - constructed_input_features = FeatureConstruction.get_constructed_features(self.inputs) - constructed_output_features = FeatureConstruction.get_constructed_features(self.output) + constructed_inputs = FeatureConstruction.get_constructed_features(self.inputs) + constructed_outputs = FeatureConstruction.get_constructed_features(self.output) # Only apply sampling method to those features which are not constructed features - # but which data is taken directly from the data frame - inputs_without_constructed = [inp for inp in self.inputs if inp not in constructed_input_features] - output_without_constructed = [out for out in self.output if out not in constructed_output_features] - - features_without_constructed: list[FeatureBase] = [FeatureConstruction.get_feature(inp) for inp in inputs_without_constructed] + # but whose data is taken directly 
from the data frame + inputs_without_constructed = [inp for inp in self.inputs if inp not in constructed_inputs] + output_without_constructed = [out for out in self.output if out not in constructed_outputs] df = df[inputs_without_constructed + output_without_constructed] @@ -217,95 +210,25 @@ def process_data(self, df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: last_valid_index = non_nan_rows.iloc[::-1].idxmax() if non_nan_rows.any() else None df = df.loc[first_valid_index:last_valid_index] - # sample input data; different inputs can have different sampling methods - res = [] - previous_or_mean_in_sampling_methods = [] - X = df[inputs_without_constructed].copy() - target_grid = self.sample_df_according_to_timestep(df).index - for f in features_without_constructed: - # only process inputs with sampling method mean_over_interval first since X cannot be sampled - # to the actual required time steps until the intermediate values were taken into the mean - if f.get_sampling_method() == 'mean_over_interval': - res.append(get_mean_over_interval(X[[f.feature]], target_grid)) - previous_or_mean_in_sampling_methods.append(True) - - # sample X according to required time step - X = self.sample_df_according_to_timestep(X) - # process inputs with sampling methods 'current' and 'previous' - for f in features_without_constructed: - _x = X[[f.feature]] - if f.get_sampling_method() == 'current': - # no transformation needed - res.append(_x) - previous_or_mean_in_sampling_methods.append(False) - elif f.get_sampling_method() == 'previous': - # shift by 1 - _x = _x.shift(1) - _x = _x.iloc[1:] - res.append(_x) - previous_or_mean_in_sampling_methods.append(True) - elif f.get_sampling_method() == 'mean_over_interval': - continue - else: - raise NotImplementedError(f"Sampling method '{f.get_sampling_method()}' not implemented.") - # concatenate sampled input data - X = pd.concat(res, axis=1) - X = X.sort_index(ascending=True) - - # Sampling methods 'previous' and 
'mean_over_interval' reduce available data points by 1. - if any(previous_or_mean_in_sampling_methods): - # if at least one of the features uses 'current' as sampling method, shorten X - if not all(previous_or_mean_in_sampling_methods): - X = X.iloc[1:] - - if X.isnull().values.any(): - if self.ignore_nan: - X.dropna(inplace=True) - else: - raise ValueError( - "Data Error: The TrainingData contains NaN values in intermediate rows. If this is intended, set " - "ignore_nan=True in PreprocessingSingleStep.") + sampler = Sampling(inputs_without_constructed, output_without_constructed, self.time_step, self.ignore_nan) + # sample input data + X = sampler.sample_unconstructed_inputs(df) # sample output data if len(output_without_constructed) != 0: # at least one non-constructed output feature - y = df[output_without_constructed].copy() - y = self.sample_df_according_to_timestep(y) - # Sampling methods 'previous' and 'mean_over_interval' reduce available data points by 1. - # synchronize length of X and y - if any(previous_or_mean_in_sampling_methods): - y = y.iloc[1:] - if y.isnull().values.any(): - if self.ignore_nan: - y.dropna(inplace=True) - else: - raise ValueError( - "Data Error: The TrainingData contains NaN values in intermediate rows. 
If this is intended," - "set ignore_nan=True in PreprocessingSingleStep.") - + y = sampler.sample_unconstructed_outputs(df) res_df = pd.concat([X, y], axis=1) - else: # only constructed outputs res_df = X # Applies feature constructions defined in `FeatureConstruction` - FeatureConstruction.process(res_df, feature_names=constructed_input_features + constructed_output_features) - - # assume constructed outputs solely base on features with sampling current or sampling previous / mean_over_interval - - if any(previous_or_mean_in_sampling_methods): - methods = ['previous', 'mean_over_interval'] - # if constructed output features are based on input features with sampling previous or mean_over_interval, - # the constructed output has to be shifted to invert the shift of the input features - for out in constructed_output_features: - out_feature = FeatureConstruction.get_feature(out) - if isinstance(out_feature, FeatureTwo): - if out_feature.feature1.get_sampling_method() in methods or out_feature.feature2.get_sampling_method() in methods: - res_df[out_feature.feature] = res_df[out_feature.feature].shift(-1) # shift - else: # constructed feature that doesn't consist of two features - if out_feature.f1.get_sampling_method() in methods: - res_df[out_feature.feature] = res_df[out_feature.feature].shift(-1) # shift - - # drop NaNs occurring due to creation of lags + FeatureConstruction.process(res_df, feature_names=constructed_inputs + constructed_outputs) + + if len(constructed_outputs) != 0: + # correct shifting of constructed outputs if any + res_df = sampler.sample_constructed_outputs(res_df, constructed_outputs) + + # drop NaNs occurring due to creation of lags (constructed feature) res_df.dropna(inplace=True) X = res_df[self.inputs] @@ -379,28 +302,6 @@ def from_config(cls, config: dict) -> 'PreprocessingSingleStep': return cls(**config) -def get_mean_over_interval(x: pd.DataFrame, target_grid: pd.DataFrame.index) -> pd.DataFrame: - """samples and returns x on target 
grid taking the mean over the interval (between the grid indices)""" - def pairwise(iterable: Iterable): - "s -> (s0,s1), (s1,s2), (s2, s3), ..." - a, b = itertools.tee(iterable) - next(b, None) - return zip(a, b) - - original_grid = np.array(x.index) - results = [] - for i, j in pairwise(target_grid): - slicer = np.logical_and(original_grid >= i, original_grid < j) - d = {'Index': j} - for inp in x.columns: - d[inp] = x[inp][slicer].mean() - results.append(d) - - x = pd.DataFrame(results).set_index('Index') - - return x - - class PreprocessingMultiStep(PreprocessingData): """ Handles preprocessing for multi-step forecasting models, typically RNNs. @@ -492,7 +393,8 @@ def process_data(self, df: pd.DataFrame) -> TrainingDataMultiStep: """ # filter data - df = self.sample_df_according_to_timestep(df) + sampler = Sampling(unconstructed_inputs=[], unconstructed_outputs=[], time_step=self.time_step) + df = sampler.sample_df_according_to_timestep(df) # Applies feature constructions defined in `FeatureConstruction`. FeatureConstruction.process(df) diff --git a/physXAI/preprocessing/sampling.py b/physXAI/preprocessing/sampling.py new file mode 100644 index 0000000..9d34196 --- /dev/null +++ b/physXAI/preprocessing/sampling.py @@ -0,0 +1,218 @@ +from typing import Union, Iterable +import pandas as pd +import numpy as np +import itertools + + +def _return_valid_sampling_method(v: Union[int, str]): + """ check the validity of the given sampling method and return a string if value is int """ + + if not isinstance(v, (int, str)): + raise TypeError(f'Type of sampling method not supported. Type is {type(v)}, must be int or str.') + + if v in ['current', 0]: + return 'current' + elif v in ['previous', 1]: + return 'previous' + elif v in ['mean_over_interval', '_']: + return v + else: + raise ValueError( + f"Value of sampling method not supported, value is: {v}. 
Sampling method must be 'current' " + f"(or 0 if sampling_method is int), 'previous' (or 1 if sampling_method is int) or 'mean_over_interval'. " + f"In case of deactivated sampling (for outputs), sampling_method must be '_'.") + + +class Sampling: + def __init__(self, unconstructed_inputs: list[str], unconstructed_outputs: list[str], time_step: Union[int, float], + ignore_nan: bool = False): + """ + A class providing methods for sampling + + Args: + unconstructed_inputs (list[str]): names of unconstructed (!) input features + unconstructed_outputs (list[str]): names of unconstructed (!) output features + time_step (Union[int, float]): sampling interval, multiple of sampling of data + ignore_nan: If True, intermediate rows with NaN values will be dropped. + If False, an error is raised if NaNs are present in intermediate rows after processing. + Default is False. + """ + self.inputs = unconstructed_inputs + self.outputs = unconstructed_outputs + self.time_step = time_step + self.ignore_nan = ignore_nan + + def sample_df_according_to_timestep(self, df: pd.DataFrame) -> pd.DataFrame: + """ + samples given data frame to the new grid defined by time_step + + Args: + df: pandas DataFrame + Returns: + pd.DataFrame: DataFrame with the new sampling grid + """ + filtering = (df.index - df.index[0]) % self.time_step == 0 + df = df[filtering] + return df + + def previous_or_mean_in_sampling_methods(self) -> list[bool]: + """ + checks if any input uses the sampling methods 'previous' or 'mean_over_interval' + + Returns: + list[bool]: list of bool stating if the sampling method of an input is prev./mean (True) or not (False) + (list in the order of self.inputs) + """ + # no import on module level possible due to circular import + from physXAI.preprocessing.preprocessing import FeatureConstruction + + arr = [] + for fn in self.inputs: + sm = FeatureConstruction.get_feature(fn).get_sampling_method() + arr.append(sm in ['previous', 'mean_over_interval']) + return arr + + def 
sample_unconstructed_inputs(self, df: pd.DataFrame) -> pd.DataFrame: + """ + extracts the unconstructed inputs from the given DataFrame, applies their corresponding sampling method and + samples them to the target grid + + Args: + df (pd.DataFrame): data + Returns: + pd.DataFrame: DataFrame (X) that solely contains all unconstructed inputs (with the correct sampling) + """ + + # no import on module level possible due to circular import + from physXAI.preprocessing.preprocessing import FeatureConstruction + + # extract inputs from DataFrame and get target sampling grid + X = df[self.inputs].copy() + target_grid = self.sample_df_according_to_timestep(df).index + + # different inputs can have different sampling methods + res = [] + features_without_constructed = [FeatureConstruction.get_feature(inp) for inp in self.inputs] + for f in features_without_constructed: + # only process inputs with sampling method mean_over_interval first since X cannot be sampled + # to the actual required time steps until the intermediate values were taken into the mean + if f.get_sampling_method() == 'mean_over_interval': + res.append(get_mean_over_interval(X[[f.feature]], target_grid)) + + # sample X to target grid + X = self.sample_df_according_to_timestep(X) + # process inputs with sampling methods 'current' and 'previous' + for f in features_without_constructed: + _x = X[[f.feature]] + if f.get_sampling_method() == 'current': + # no transformation needed + res.append(_x) + elif f.get_sampling_method() == 'previous': + # shift by 1 + _x = _x.shift(1) + _x = _x.iloc[1:] + res.append(_x) + elif f.get_sampling_method() == 'mean_over_interval': + continue + else: + raise NotImplementedError(f"Sampling method '{f.get_sampling_method()}' not implemented.") + # concatenate sampled input data + X = pd.concat(res, axis=1) + X = X.sort_index(ascending=True) + + # Sampling methods 'previous' and 'mean_over_interval' reduce available data points by 1. 
+ previous_or_mean = self.previous_or_mean_in_sampling_methods() + if any(previous_or_mean): + # if at least one of the features uses 'current' as sampling method, shorten X + if not all(previous_or_mean): + X = X.iloc[1:] + + # check for NaNs + if X.isnull().values.any(): + if self.ignore_nan: + X.dropna(inplace=True) + else: + raise ValueError( + "Data Error: The input data contains NaN values in intermediate rows. If this is intended, set " + "ignore_nan=True in PreprocessingSingleStep.") + return X + + def sample_unconstructed_outputs(self, df: pd.DataFrame) -> pd.DataFrame: + """ + extracts the unconstructed outputs from the given DataFrame and samples them to the target grid + + Args: + df (pd.DataFrame): data + Returns: + pd.DataFrame: DataFrame (y) that solely contains all unconstructed outputs + """ + y = df[self.outputs].copy() + y = self.sample_df_according_to_timestep(y) + + # Sampling methods 'previous' and 'mean_over_interval' reduce available data points by 1. + # synchronize length of X and y + if any(self.previous_or_mean_in_sampling_methods()): + y = y.iloc[1:] + + # check for NaNs + if y.isnull().values.any(): + if self.ignore_nan: + y.dropna(inplace=True) + else: + raise ValueError( + "Data Error: The output data contains NaN values in intermediate rows. If this is intended," + "set ignore_nan=True in PreprocessingSingleStep.") + return y + + def sample_constructed_outputs(self, df: pd.DataFrame, constructed_outputs: list[str]) -> pd.DataFrame: + """ + Correct shifting of constructed outputs if they are based on input features with sampling previous or mean_over_interval. + Since the inputs are shifted before the constructed features are created, the constructed output has to be + shifted to invert / neutralize the shift of the input features that was applied before. 
+ + Args: + df (pd.DataFrame): data including constructed features + constructed_outputs (list[str]): names of constructed output features + Returns: + pd.DataFrame: modified DataFrame (df) + """ + # no import on module level possible due to circular import + from physXAI.preprocessing.preprocessing import FeatureConstruction, FeatureTwo + + if any(self.previous_or_mean_in_sampling_methods()): + methods = ['previous', 'mean_over_interval'] + for out in constructed_outputs: + out_feature = FeatureConstruction.get_feature(out) + if isinstance(out_feature, FeatureTwo): + # correct shifting only if output bases on input features with before mentioned sampling methods + if (out_feature.feature1.get_sampling_method() in methods or + out_feature.feature2.get_sampling_method() in methods): + df[out_feature.feature] = df[out_feature.feature].shift(-1) + else: # constructed feature that doesn't consist of two features (FeatureExp, ...) + # correct shifting only if output bases on input features with before mentioned sampling methods + if out_feature.f1.get_sampling_method() in methods: + df[out_feature.feature] = df[out_feature.feature].shift(-1) + return df + + +def get_mean_over_interval(x: pd.DataFrame, target_grid: pd.DataFrame.index) -> pd.DataFrame: + """samples and returns x on target grid taking the mean over the interval (between the grid indices)""" + + def pairwise(iterable: Iterable): + "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
+ a, b = itertools.tee(iterable) + next(b, None) + return zip(a, b) + + original_grid = np.array(x.index) + results = [] + for i, j in pairwise(target_grid): + slicer = np.logical_and(original_grid >= i, original_grid < j) + d = {'Index': j} + for inp in x.columns: + d[inp] = x[inp][slicer].mean() + results.append(d) + + x = pd.DataFrame(results).set_index('Index') + + return x diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index 5980116..e3bd9b5 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -213,6 +213,8 @@ def test_different_sampling_methods(file_path, inputs_tair_extended): assert e.get_sampling_method() == 'previous' assert change_tair.get_sampling_method() == '_' + with pytest.warns(UserWarning): + c = FeatureConstant(c=100, name='test_const', sampling_method=1) FeatureConstruction.reset() @@ -313,7 +315,8 @@ def test_deprecated_shift(p_hp_data, inputs_php, output_php, file_path): Logger.setup_logger(base_path=base_path, folder_name='unittests\\test_coverage', override=True) # Create & process Training data - prep = PreprocessingSingleStep(inputs_php, output_php, shift=0) # deprecated shift given in preprocessing + with pytest.warns(DeprecationWarning): + prep = PreprocessingSingleStep(inputs_php, output_php, shift=0) # deprecated shift given in preprocessing td = prep.pipeline(file_path) m = ClassicalANNModel(epochs=1, n_neurons=[4, 4], n_layers=2, activation_function=['softplus', 'softplus'], @@ -344,8 +347,8 @@ def test_deprecated_shift(p_hp_data, inputs_php, output_php, file_path): "random_state": 42, "time_step": 1.0, } - - a = PreprocessingData.from_config(config_prep) + with pytest.warns(DeprecationWarning): + a = PreprocessingData.from_config(config_prep) assert isinstance(a, PreprocessingSingleStep) assert Feature.get_default_sampling_method() == 'current' From 3cbcdba2ecc3251219c191aff9ba6903388c215e Mon Sep 17 00:00:00 2001 From: Patrick Henkel Date: Mon, 22 Dec 2025 17:25:26 +0100 Subject: [PATCH 
39/42] Updated --- physXAI/preprocessing/constructed.py | 14 +++++++------- physXAI/preprocessing/sampling.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index 08e9616..d883275 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -1,9 +1,9 @@ from abc import ABC, abstractmethod -from typing import Type, Union +from typing import Optional, Type, Union import numpy as np from pandas import DataFrame, Series import warnings -from physXAI.preprocessing.sampling import _return_valid_sampling_method +from physXAI.preprocessing.sampling import return_valid_sampling_method class FeatureBase(ABC): @@ -13,13 +13,13 @@ class FeatureBase(ABC): in a Pandas DataFrame. It supports arithmetic operations to combine features. """ - def __init__(self, name: str, sampling_method: Union[str, int] = None, **kwargs): + def __init__(self, name: str, sampling_method: Optional[Union[str, int]] = None, **kwargs): """ Initializes a FeatureBase instance. Args: name (str): The name of the feature. This will be the column name in the DataFrame. - sampling_method (Union[str, int]): Time step of the input data used to predict the output. + sampling_method (Optional[Union[str, int]]): Time step of the input data used to predict the output. - if None: Feature._default_sampling_method is used - if 'current' or 0: Current time step will be used for prediction. - if 'previous' or 1: Previous time step will be used for prediction. 
@@ -50,7 +50,7 @@ def set_sampling_method(self, val: Union[str, int] = None): if val is None: self._sampling_method = Feature.get_default_sampling_method() else: - self._sampling_method = _return_valid_sampling_method(val) + self._sampling_method = return_valid_sampling_method(val) def rename(self, name: str): """ @@ -214,7 +214,7 @@ def set_default_sampling_method(cls, val: Union[str, int]): - 'previous' or 1: Previous time step will be used for prediction. - 'mean_over_interval': Mean between current and previous time step will be used. """ - Feature._default_sampling_method = _return_valid_sampling_method(val) + Feature._default_sampling_method = return_valid_sampling_method(val) def get_sampling_from_base_feature(base_features: Union[FeatureBase, list[FeatureBase]], **kwargs) -> [str, list]: @@ -254,7 +254,7 @@ def get_sampling_from_base_feature(base_features: Union[FeatureBase, list[Featur # necessary for feature construction from config sampling_method = '_' else: - assert _return_valid_sampling_method(kwargs['sampling_method']) == sampling_method, ( + assert return_valid_sampling_method(kwargs['sampling_method']) == sampling_method, ( f"Constructed features must have the same sampling method as their base feature(s). Sampling method of " f"base feature(s) is {sampling_method} but {kwargs['sampling_method']} was given as sampling method." 
) diff --git a/physXAI/preprocessing/sampling.py b/physXAI/preprocessing/sampling.py index 9d34196..4426c42 100644 --- a/physXAI/preprocessing/sampling.py +++ b/physXAI/preprocessing/sampling.py @@ -4,7 +4,7 @@ import itertools -def _return_valid_sampling_method(v: Union[int, str]): +def return_valid_sampling_method(v: Union[int, str]): """ check the validity of the given sampling method and return a string if value is int """ if not isinstance(v, (int, str)): From 1af0fb4aa1867c401e33ba19861f077901e490f8 Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 24 Dec 2025 13:08:33 +0100 Subject: [PATCH 40/42] Updated --- .../different_sampling_methods.py | 2 +- physXAI/preprocessing/constructed.py | 30 ++++++++++++++----- unittests/test_coverage.py | 10 +++++-- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py index 4048da2..37205eb 100644 --- a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py +++ b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py @@ -53,7 +53,7 @@ y = x1 + lx1[0] z = y + x1 z.rename('example_feature_two') # since z is a constructed feature based on x1, its sampling_method will be previous -e = FeatureExp(x1-273.15, 'exp', sampling_method=1) # reduce x1 by 273.15, otherwise values are too high +e = FeatureExp(x1-273.15, 'exp') # reduce x1 by 273.15, otherwise values are too high inputs.extend([z, e]) # add dummy features to inputs # construct output diff --git a/physXAI/preprocessing/constructed.py b/physXAI/preprocessing/constructed.py index d883275..945eea9 100644 --- a/physXAI/preprocessing/constructed.py +++ b/physXAI/preprocessing/constructed.py @@ -219,7 +219,10 @@ def set_default_sampling_method(cls, val: Union[str, int]): def get_sampling_from_base_feature(base_features: Union[FeatureBase, list[FeatureBase]], **kwargs) -> [str, list]: """ - Returns 
the appropriate sampling_method for a constructed feature based on its base feature(s) + Returns the appropriate sampling_method for a constructed feature based on its base feature(s). A constructed + feature must be built from features with sampling methods that apply the same time shift. Therefore, constructed + features can either base on features which have solely the sampling method 'current' (no time shift applied) or on + features which have one of the sampling methods ['previous','mean_over_interval'] (time shift of one unit applied). Args: base_features (Union[FeatureBase, list[FeatureBase]]): single base feature or list of max. two base features @@ -244,20 +247,31 @@ def get_sampling_from_base_feature(base_features: Union[FeatureBase, list[Featur else: raise ValueError(f"Expected type [FeatureBase, int, float], got type {type(f)} instead") - if len(sampling) > 1: - assert len(set(sampling)) == 1, f'Sampling methods of base feature are not equal, got {sampling}' + sampling = list(set(sampling)) - sampling_method = sampling[0] + if len(sampling) == 1: + sampling_method = sampling[0] + else: + if 'current' in sampling: # 'current' together with other sampling methods + raise ValueError(f"Sampling method(s) of base feature(s) are not equal 'current', got sampling method(s): {sampling}") + else: # 'previous' together with 'mean_over_interval' + sampling_method = 'mean_over_interval' if 'sampling_method' in kwargs.keys(): if 'ignore_sampling_for_output' in kwargs.keys() and kwargs['ignore_sampling_for_output']: # necessary for feature construction from config sampling_method = '_' else: - assert return_valid_sampling_method(kwargs['sampling_method']) == sampling_method, ( - f"Constructed features must have the same sampling method as their base feature(s). Sampling method of " - f"base feature(s) is {sampling_method} but {kwargs['sampling_method']} was given as sampling method." 
- ) + message = (f"Constructed features must be built from features with sampling methods that apply the same " + f"time shift. Therefore, constructed features can either base on features which have solely the " + f"sampling method 'current' (no time shift applied) or on features which have one of the sampling" + f" methods ['previous','mean_over_interval'] (time shift of one unit applied).\n" + f"Sampling method of base feature(s) is '{sampling_method}' but in kwargs " + f"'{return_valid_sampling_method(kwargs['sampling_method'])}' was given as sampling method.") + if sampling_method == 'current': + assert return_valid_sampling_method(kwargs['sampling_method']) == sampling_method, message + else: + assert return_valid_sampling_method(kwargs['sampling_method']) in ['previous', 'mean_over_interval'], message kwargs.__delitem__('sampling_method') # constructor must not get more than one arg with the same key return sampling_method, kwargs diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index e3bd9b5..7d409fd 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -127,9 +127,15 @@ def test_unsupported_type(self): # test case: lags of the same input have mismatching sampling methods def test_lag_with_mismatching_sampling_methods(self): - x = Feature('test', sampling_method='current') + # allowed + x = Feature('test_correct', sampling_method='mean_over_interval') + x2 = FeatureLag(x, lag=2, sampling_method='mean_over_interval') + e = FeatureExp(x, sampling_method='previous') + + # not allowed + y = Feature('test_fault', sampling_method='current') with self.assertRaises(AssertionError): - FeatureLag(x, lag=1, sampling_method='previous') + FeatureLag(y, lag=1, sampling_method='previous') FeatureConstruction.reset() From bcef26ed33490804c502934b664e4b51d721578a Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Wed, 24 Dec 2025 13:30:50 +0100 Subject: [PATCH 41/42] Updated --- .../different_sampling_methods.py | 16 +++++++++++++--- 
unittests/test_coverage.py | 15 +++++++++++---- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py index 37205eb..77816d2 100644 --- a/executables/bestest_hydronic_heat_pump/different_sampling_methods.py +++ b/executables/bestest_hydronic_heat_pump/different_sampling_methods.py @@ -7,7 +7,7 @@ This script demonstrates the usage of different sampling methods. It is not physically meaningful. When creating a Feature, a sampling method can be specified. -For constructed features, no sampling method is necessary. It is assigned based on their corresponding base feature(s) +For constructed features, no sampling method is necessary. It is assigned based on their corresponding base feature(s). sampling_method (Union[str, int]): Time step of the input data used to predict the output. - if None: Feature.get_default_sampling_method() is used @@ -17,7 +17,12 @@ Specify default sampling method using Feature.set_default_sampling_method(). If no default sampling method is specified by the user, 'previous' is used as default. + +Constructed features must be built from features with sampling methods that apply the same time shift. Therefore, +constructed features can either base on features which have solely the sampling method 'current' (no time shift applied) +or on features which have one of the sampling methods ['previous','mean_over_interval'] (time shift of one unit applied). 
""" + Feature.set_default_sampling_method(0) # Setup up logger for saving @@ -46,7 +51,7 @@ lx2 = x2.lag(1) # create lag of oveHeaPumY_u: oveHeaPumY_u_lag1, oveHeaPumY_u_lag2 -x3 = Feature('oveHeaPumY_u') +x3 = Feature('oveHeaPumY_u', sampling_method='mean_over_interval') x3.lag(2) # dummy Features @@ -54,7 +59,12 @@ z = y + x1 z.rename('example_feature_two') # since z is a constructed feature based on x1, its sampling_method will be previous e = FeatureExp(x1-273.15, 'exp') # reduce x1 by 273.15, otherwise values are too high -inputs.extend([z, e]) # add dummy features to inputs + +# x1 and x3 have sampling methods 'previous' and 'mean_over_interval'. +# Since both of them apply a time shift of one, they can be combined in constructed features +a = x1 + x3 + +inputs.extend([z, e, a]) # add dummy features to inputs # construct output change_tair = x1 - lx1[0] diff --git a/unittests/test_coverage.py b/unittests/test_coverage.py index 7d409fd..de712b0 100644 --- a/unittests/test_coverage.py +++ b/unittests/test_coverage.py @@ -132,6 +132,9 @@ def test_lag_with_mismatching_sampling_methods(self): x2 = FeatureLag(x, lag=2, sampling_method='mean_over_interval') e = FeatureExp(x, sampling_method='previous') + with pytest.warns(UserWarning): + c = FeatureConstant(c=100, name='test_const', sampling_method=1) + # not allowed y = Feature('test_fault', sampling_method='current') with self.assertRaises(AssertionError): @@ -188,7 +191,7 @@ def test_different_sampling_methods(file_path, inputs_tair_extended): lx1 = x1.lag(2) # reaTZon_y_lag1, reaTZon_y_lag2 x2 = Feature('weaSta_reaWeaTDryBul_y') lx2 = x2.lag(1) # weaSta_reaWeaTDryBul_y_lag1 - x3 = Feature('oveHeaPumY_u') + x3 = Feature('oveHeaPumY_u', sampling_method='mean_over_interval') x3.lag(2) # oveHeaPumY_u_lag1, oveHeaPumY_u_lag2 # dummy Features @@ -197,7 +200,12 @@ def test_different_sampling_methods(file_path, inputs_tair_extended): z.rename('test_feature_two') e = FeatureExp(x1 - 273.15, 'exp', sampling_method=1) # 
reduce x1 by 273.15, otherwise values are too high - inputs_tair_extended.extend([z, e]) + # x1 and x3 have sampling methods 'previous' and 'mean_over_interval'. + # Since both of them apply a time shift of one, they can be combined in constructed features + a = x1 + x3 + a.rename('test_add') + + inputs_tair_extended.extend([z, e, a]) # output change_tair = x1 - lx1[0] @@ -216,11 +224,10 @@ def test_different_sampling_methods(file_path, inputs_tair_extended): assert x2.get_sampling_method() == 'current' and lx2.get_sampling_method() == 'current' assert FeatureConstruction.get_feature('weaSta_reaWeaHDirNor_y').get_sampling_method() == 'mean_over_interval' assert FeatureConstruction.get_feature('test_feature_two').get_sampling_method() == 'previous' + assert FeatureConstruction.get_feature('test_add').get_sampling_method() == 'mean_over_interval' assert e.get_sampling_method() == 'previous' assert change_tair.get_sampling_method() == '_' - with pytest.warns(UserWarning): - c = FeatureConstant(c=100, name='test_const', sampling_method=1) FeatureConstruction.reset() From 095516eff9d580d0d2d13a9fd4ca0527362e286a Mon Sep 17 00:00:00 2001 From: "ross.simon" Date: Fri, 2 Jan 2026 17:45:40 +0100 Subject: [PATCH 42/42] corrected usage of input list --- physXAI/preprocessing/preprocessing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/physXAI/preprocessing/preprocessing.py b/physXAI/preprocessing/preprocessing.py index 958ea0c..7e5d613 100644 --- a/physXAI/preprocessing/preprocessing.py +++ b/physXAI/preprocessing/preprocessing.py @@ -355,11 +355,11 @@ def __init__(self, inputs: list[str], output: Union[str, list[str]], label_width keras.utils.set_random_seed(random_state) # Determine necessary parameters for window creation - self.features: list[str] = (inputs + self.output + - [f for f in self.init_features if f not in inputs and f not in self.output]) + self.features: list[str] = (self.inputs + self.output + + [f for f in self.init_features if 
f not in self.inputs and f not in self.output]) self.column_indices: dict[str, int] = {name: i for i, name in enumerate(self.features)} - self.warmup_columns_input: list[str] = list(set(self.init_features) & set(inputs)) - self.warmup_columns_labels: list[str] = list(set(self.init_features) - set(inputs)) + self.warmup_columns_input: list[str] = list(set(self.init_features) & set(self.inputs)) + self.warmup_columns_labels: list[str] = list(set(self.init_features) - set(self.inputs)) self.label_width: int = label_width self.warmup_width: int = warmup_width