Skip to content

Commit

Permalink
Merge pull request #110 from alliander-opensource/features-KTPS-1368-…
Browse files Browse the repository at this point in the history
…implement-minimal-sklearn

Features ktps 1368 implement minimal sklearn
  • Loading branch information
JanMaartenvanDoorn authored Jul 1, 2021
2 parents 19f0ce0 + 5ed51d4 commit 6fbf4f2
Show file tree
Hide file tree
Showing 136 changed files with 17,331 additions and 4,668 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docs-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ jobs:
with:
pre-build-command: |
cp requirements.txt docs/requirements.txt
echo "sphinx_rtd_theme" >> docs/requirements.txt
echo -e "\nsphinx_rtd_theme" >> docs/requirements.txt
sphinx-apidoc -o docs openstf
docs-folder: "docs/"
2 changes: 1 addition & 1 deletion .github/workflows/docs-publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
with:
pre-build-command: |
cp requirements.txt docs/requirements.txt
echo "sphinx_rtd_theme" >> docs/requirements.txt
echo -e "\nsphinx_rtd_theme" >> docs/requirements.txt
sphinx-apidoc -o docs openstf
docs-folder: "docs/"
# Upload artifact so it is available from the action-window
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ dmypy.json

# Cython debug symbols
cython_debug/
/test/reports/
7 changes: 7 additions & 0 deletions openstf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,12 @@
# SPDX-License-Identifier: MPL-2.0

from pathlib import Path
from importlib.metadata import version, PackageNotFoundError

PROJECT_ROOT = Path(__file__).parent.parent.absolute()

# Expose the installed distribution version as openstf.__version__.
# Falls back silently (no __version__ attribute) when the package is not
# installed, e.g. when running directly from a source checkout.
try:
    __version__ = version("openstf")
except PackageNotFoundError:
    # package is not installed
    pass
8 changes: 7 additions & 1 deletion openstf/enums.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2017-2021 Alliander N.V. <korte.termijn.prognoses@alliander.com> # noqa E501>
#
# SPDX-License-Identifier: MPL-2.0
from enum import Enum
from enum import Enum, auto


# TODO replace this with ModelType (MLModelType == Machine Learning model type)
Expand All @@ -16,3 +16,9 @@ class ForecastType(Enum):
WIND = "wind"
SOLAR = "solar"
BASECASE = "basecase"


class TracyJobResult(Enum):
    """Possible outcomes of a Tracy job.

    The string values are the canonical result labels
    (presumably used for logging/reporting — confirm against callers).
    """

    SUCCESS = "success"
    FAILED = "failed"
    UNKNOWN = "unknown"
22 changes: 13 additions & 9 deletions openstf/feature_engineering/apply_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
The normalised wind power according to the turbine-specific power curve
"""
from typing import List

import pandas as pd

from openstf.feature_engineering.holiday_features import (
Expand All @@ -26,19 +28,23 @@


def apply_features(
data: pd.DataFrame, features: list = None, horizon: float = 24.0
data: pd.DataFrame, feature_names: List[str] = None, horizon: float = 24.0
) -> pd.DataFrame:
"""This script applies the feature functions defined in
feature_functions.py and returns the complete dataframe. Features requiring
more recent label-data are omitted.
NOTE: For the time-derived features, only the ones in the features list
will be added. But for the weather features, all will be added at present.
These unrequested additional features have to be filtered out later.
Args:
data (pandas.DataFrame): a pandas dataframe with input data in the form:
pd.DataFrame(
index=datetime,
columns=[label, predictor_1,..., predictor_n]
)
features (list of strs): list of reuqested features
feature_names (List[str]): list of requested features
horizon (float): Forecast horizon limit in hours.
Returns:
Expand All @@ -56,9 +62,8 @@ def apply_features(
np.random.uniform(0.7,1.7, 200)))
"""

# Get lag feature functions
feature_functions = generate_lag_feature_functions(features, horizon)
feature_functions = generate_lag_feature_functions(feature_names, horizon)

# Get timedrivenfeature functions
feature_functions.update(
Expand All @@ -77,16 +82,15 @@ def apply_features(
# Add the features to the dataframe using previously defined feature functions
for key, featfunc in feature_functions.items():
# Don't generate feature is not in features
if features is not None:
if key not in features:
continue
if feature_names is not None and key not in feature_names:
continue
data[key] = data.iloc[:, [0]].apply(featfunc)

# Add additional wind features
data = add_additional_wind_features(data, features)
data = add_additional_wind_features(data, feature_names)

# Add humidity features
data = add_humidity_features(data, features)
data = add_humidity_features(data, feature_names)

# Return dataframe including all requested features
return data
70 changes: 31 additions & 39 deletions openstf/feature_engineering/feature_applicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,36 @@
#
# SPDX-License-Identifier: MPL-2.0
from abc import ABC, abstractmethod
from typing import List, Optional

import numpy as np
import pandas as pd

from openstf.feature_engineering.apply_features import apply_features
from openstf.feature_engineering.general import (
add_missing_feature_columns,
remove_extra_feature_columns,
remove_non_requested_feature_columns,
enforce_feature_order,
)

LATENCY_CONFIG = {"APX": 24} # A specific latency is part of a specific feature.


class AbstractFeatureApplicator(ABC):
def __init__(self, horizons: list, features: list = None) -> None:
def __init__(
self, horizons: List[float], feature_names: Optional[List[str]] = None
) -> None:
"""Initialize abstract feature applicator.
Args:
horizons: (list) list of horizons
features: (list) List of requested features
horizons (list): list of horizons
feature_names (List[str]): List of requested features
"""
if type(horizons) is not list:
raise ValueError("Horizons must be added as a list")
if type(horizons) is not list and not None:
raise ValueError("horizons must be added as a list")

self.feature_names = feature_names
self.horizons = horizons
self.features = features

@abstractmethod
def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -39,7 +44,7 @@ def add_features(self, df: pd.DataFrame) -> pd.DataFrame:


class TrainFeatureApplicator(AbstractFeatureApplicator):
def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
def add_features(self, df: pd.DataFrame, latency_config=None) -> pd.DataFrame:
"""Adds features to an input DataFrame.
This method is implemented specifically for a model train pipeline. For larger
Expand All @@ -52,11 +57,17 @@ def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
Args:
df (pd.DataFrame): Input data to which the features will be added.
latency_config (dict): Optional. Invalidate certain features that are not
available for a specific horizon due to data latency. Default to
{"APX": 24}
Returns:
pd.DataFrame: Input DataFrame with an extra column for every added feature.
"""

if latency_config is None:
latency_config = LATENCY_CONFIG

# Set default horizons if none are provided
if self.horizons is None:
self.horizons = [0.25, 24]
Expand All @@ -69,15 +80,16 @@ def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
res = apply_features(
df.copy(deep=True), horizon=horizon
) # Deep copy of df is important, because we want a fresh start every iteration!
res["Horizon"] = horizon
res["horizon"] = horizon
result = result.append(res)

# Invalidate features that are not available for a specific horizon due to data
# latency
for feature, time in LATENCY_CONFIG.items():
result.loc[result["Horizon"] > time, feature] = np.nan
for feature, time in latency_config.items():
result.loc[result["horizon"] > time, feature] = np.nan

return result.sort_index()
# Sort all features except for the (first) load and (last) horizon columns
return enforce_feature_order(result)


class OperationalPredictFeatureApplicator(AbstractFeatureApplicator):
Expand All @@ -98,31 +110,11 @@ def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
if num_horizons != 1:
raise ValueError("Expected one horizon, got {num_horizons}")

df = apply_features(df, features=self.features, horizon=self.horizons[0])
df = add_missing_feature_columns(df, self.features)
df = remove_extra_feature_columns(df, self.features)

return df


class BackTestPredictFeatureApplicator(AbstractFeatureApplicator):
def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Adds features to an input DataFrame.
This method is implemented specifically for a backtest prediction for a specific horizon.
All features that are not available for the specific horizon are invalidated.
Args:
df: pd.DataFrame with input data to which the features have to be added
Returns:
pd.DataFrame: Input DataFrame with an extra column for every added feature.
"""
num_horizons = len(self.horizons)
if num_horizons != 1:
raise ValueError("Expected one horizon, got {num_horizons}")
df = apply_features(
df, feature_names=self.feature_names, horizon=self.horizons[0]
)
df = add_missing_feature_columns(df, self.feature_names)
# NOTE this is required since apply_features could add additional features
df = remove_non_requested_feature_columns(df, self.feature_names)

df = apply_features(df, horizon=self.horizons[0])
df = add_missing_feature_columns(df, self.features)
df = remove_extra_feature_columns(df, self.features)
return df
return enforce_feature_order(df)
90 changes: 75 additions & 15 deletions openstf/feature_engineering/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,36 @@
# SPDX-License-Identifier: MPL-2.0

# -*- coding: utf-8 -*-
from typing import List
import numpy as np
import pandas as pd
import structlog

logger = structlog.get_logger(__name__)


def add_missing_feature_columns(
input_data: pd.DataFrame, featurelist: list
input_data: pd.DataFrame, features: List[str]
) -> pd.DataFrame:
"""Adds feature column for features in the featurelist.
Add feature columns for features in the feature list if these columns don't
exist in the input data. If a column is added, its value is set to NaN.
This is especially usefull to make sure the required columns are in place when making a prediction.
This is especially usefull to make sure the required columns are in place when
making a prediction.
NOTE: this function is intended as a final check to prevent errors during prediction.
In an ideal world this function is not necessary.
Args:
input_data (pd.DataFrame): DataFrame with input data and featurs.
featurelist (list): List of requiered features
features (list): List of required features.
"""
missing_features = [f for f in featurelist if f not in list(input_data)]

logger = structlog.get_logger(__name__)

if features is None:
features = []

missing_features = [f for f in features if f not in list(input_data)]

for feature in missing_features:
logger.warning(
Expand All @@ -37,27 +43,81 @@ def add_missing_feature_columns(
return input_data


def remove_extra_feature_columns(
input_data: pd.DataFrame, featurelist: list
def remove_non_requested_feature_columns(
input_data: pd.DataFrame, requested_features: List[str]
) -> pd.DataFrame:
"""Removes any features that are provided in the input data but not in the feature list.
"""Removes features that are provided in the input data but not in the feature list.
This should not be necessary but serves as an extra failsafe for making predictions
Args:
input_data: (pd.Dataframe) DataFrame with features
featurelist: (list) list of reuqested features
Returns: pd.DataFrame with model input data and fetaures
requested_features: (list) list of requested features
Returns:
pd.DataFrame: Model input data with features.
"""
extra_features = [f for f in list(input_data) if f not in featurelist]
logger = structlog.get_logger(__name__)

num_not_requested_features = len(extra_features)
if requested_features is None:
requested_features = []

not_requested_features = [
f for f in list(input_data) if f not in requested_features
]

# Do not see "load" or "horizon" as an extra feature as it is no feature
if "load" in not_requested_features:
not_requested_features.remove("load")

num_not_requested_features = len(not_requested_features)

if num_not_requested_features != 0:
logger.warning(
f"Removing {num_not_requested_features} unrequested features!",
num_not_requested_features=num_not_requested_features,
)

return input_data.drop(extra_features, axis=1)
return input_data.drop(not_requested_features, axis=1)


def enforce_feature_order(input_data: pd.DataFrame) -> pd.DataFrame:
    """Enforce the canonical order of feature columns.

    Alphabetically orders the feature columns, while the first column (the
    variable to be predicted, usually "load") stays in front and the
    "horizon" column, if present, is moved to the last position:

        [first column] -- [alphabetically sorted features] -- ["horizon"]

    This function assumes the first column contains the to-be-predicted
    variable. The "horizon" column is present during the training process
    but not in the forecasting process.

    Args:
        input_data (pd.DataFrame): Input data with features.

    Returns:
        pd.DataFrame: Input data with columns in the canonical order.
    """
    # Extract the first column name; most of the time this is "load".
    first_column_name = input_data.columns.to_list()[0]

    # Alphabetically sort all columns, then move the first column back
    # to the start.
    sorted_columns = sorted(input_data.columns.to_list())
    sorted_columns.remove(first_column_name)
    column_order = [first_column_name] + sorted_columns

    # If a "horizon" column is available (training process only), move it
    # to the end.
    if "horizon" in column_order[1:]:
        column_order.remove("horizon")
        column_order.append("horizon")

    # Return the dataframe with columns in the correct order.
    return input_data.loc[:, column_order]
Loading

0 comments on commit 6fbf4f2

Please sign in to comment.