Skip to content

Commit

Permalink
Merge pull request #110 from alliander-opensource/features-KTPS-1368-…
Browse files Browse the repository at this point in the history
…implement-minimal-sklearn

Features ktps 1368 implement minimal sklearn
  • Loading branch information
JanMaartenvanDoorn authored Jul 1, 2021
2 parents 19f0ce0 + 5ed51d4 commit 6fbf4f2
Show file tree
Hide file tree
Showing 136 changed files with 17,331 additions and 4,668 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docs-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ jobs:
with:
pre-build-command: |
cp requirements.txt docs/requirements.txt
echo "sphinx_rtd_theme" >> docs/requirements.txt
echo -e "\nsphinx_rtd_theme" >> docs/requirements.txt
sphinx-apidoc -o docs openstf
docs-folder: "docs/"
2 changes: 1 addition & 1 deletion .github/workflows/docs-publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
with:
pre-build-command: |
cp requirements.txt docs/requirements.txt
echo "sphinx_rtd_theme" >> docs/requirements.txt
echo -e "\nsphinx_rtd_theme" >> docs/requirements.txt
sphinx-apidoc -o docs openstf
docs-folder: "docs/"
# Upload artifact so it is available from the action-window
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ dmypy.json

# Cython debug symbols
cython_debug/
/test/reports/
7 changes: 7 additions & 0 deletions openstf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,12 @@
# SPDX-License-Identifier: MPL-2.0

from pathlib import Path
from importlib.metadata import version, PackageNotFoundError

PROJECT_ROOT = Path(__file__).parent.parent.absolute()

# Expose the installed distribution version as openstf.__version__.
# Falls back silently (no __version__ attribute) when the package is not
# installed, e.g. when running directly from a source checkout.
try:
    __version__ = version("openstf")
except PackageNotFoundError:
    # package is not installed
    pass
8 changes: 7 additions & 1 deletion openstf/enums.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2017-2021 Alliander N.V. <korte.termijn.prognoses@alliander.com> # noqa E501>
#
# SPDX-License-Identifier: MPL-2.0
from enum import Enum
from enum import Enum, auto


# TODO replace this with ModelType (MLModelType == Machine Learning model type)
Expand All @@ -16,3 +16,9 @@ class ForecastType(Enum):
WIND = "wind"
SOLAR = "solar"
BASECASE = "basecase"


class TracyJobResult(Enum):
    """Possible outcomes of a Tracy job.

    The string values are the canonical result labels
    (presumably used for logging/reporting — confirm against callers).
    """

    SUCCESS = "success"
    FAILED = "failed"
    UNKNOWN = "unknown"
22 changes: 13 additions & 9 deletions openstf/feature_engineering/apply_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
The normalised wind power according to the turbine-specific power curve
"""
from typing import List

import pandas as pd

from openstf.feature_engineering.holiday_features import (
Expand All @@ -26,19 +28,23 @@


def apply_features(
data: pd.DataFrame, features: list = None, horizon: float = 24.0
data: pd.DataFrame, feature_names: List[str] = None, horizon: float = 24.0
) -> pd.DataFrame:
"""This script applies the feature functions defined in
feature_functions.py and returns the complete dataframe. Features requiring
more recent label-data are omitted.
NOTE: For the time-derived features, only the ones in the features list
will be added. But for the weather features, all will be added at present.
These unrequested additional features have to be filtered out later.
Args:
data (pandas.DataFrame): a pandas dataframe with input data in the form:
pd.DataFrame(
index=datetime,
columns=[label, predictor_1,..., predictor_n]
)
features (list of strs): list of reuqested features
feature_names (List[str]): list of requested features
horizon (float): Forecast horizon limit in hours.
Returns:
Expand All @@ -56,9 +62,8 @@ def apply_features(
np.random.uniform(0.7,1.7, 200)))
"""

# Get lag feature functions
feature_functions = generate_lag_feature_functions(features, horizon)
feature_functions = generate_lag_feature_functions(feature_names, horizon)

# Get timedrivenfeature functions
feature_functions.update(
Expand All @@ -77,16 +82,15 @@ def apply_features(
# Add the features to the dataframe using previously defined feature functions
for key, featfunc in feature_functions.items():
# Don't generate feature is not in features
if features is not None:
if key not in features:
continue
if feature_names is not None and key not in feature_names:
continue
data[key] = data.iloc[:, [0]].apply(featfunc)

# Add additional wind features
data = add_additional_wind_features(data, features)
data = add_additional_wind_features(data, feature_names)

# Add humidity features
data = add_humidity_features(data, features)
data = add_humidity_features(data, feature_names)

# Return dataframe including all requested features
return data
70 changes: 31 additions & 39 deletions openstf/feature_engineering/feature_applicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,36 @@
#
# SPDX-License-Identifier: MPL-2.0
from abc import ABC, abstractmethod
from typing import List, Optional

import numpy as np
import pandas as pd

from openstf.feature_engineering.apply_features import apply_features
from openstf.feature_engineering.general import (
add_missing_feature_columns,
remove_extra_feature_columns,
remove_non_requested_feature_columns,
enforce_feature_order,
)

LATENCY_CONFIG = {"APX": 24} # A specific latency is part of a specific feature.


class AbstractFeatureApplicator(ABC):
def __init__(self, horizons: list, features: list = None) -> None:
def __init__(
self, horizons: List[float], feature_names: Optional[List[str]] = None
) -> None:
"""Initialize abstract feature applicator.
Args:
horizons: (list) list of horizons
features: (list) List of requested features
horizons (list): list of horizons
feature_names (List[str]): List of requested features
"""
if type(horizons) is not list:
raise ValueError("Horizons must be added as a list")
if type(horizons) is not list and not None:
raise ValueError("horizons must be added as a list")

self.feature_names = feature_names
self.horizons = horizons
self.features = features

@abstractmethod
def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -39,7 +44,7 @@ def add_features(self, df: pd.DataFrame) -> pd.DataFrame:


class TrainFeatureApplicator(AbstractFeatureApplicator):
def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
def add_features(self, df: pd.DataFrame, latency_config=None) -> pd.DataFrame:
"""Adds features to an input DataFrame.
This method is implemented specifically for a model train pipeline. For larger
Expand All @@ -52,11 +57,17 @@ def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
Args:
df (pd.DataFrame): Input data to which the features will be added.
latency_config (dict): Optional. Invalidate certain features that are not
available for a specific horizon due to data latency. Default to
{"APX": 24}
Returns:
pd.DataFrame: Input DataFrame with an extra column for every added feature.
"""

if latency_config is None:
latency_config = LATENCY_CONFIG

# Set default horizons if none are provided
if self.horizons is None:
self.horizons = [0.25, 24]
Expand All @@ -69,15 +80,16 @@ def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
res = apply_features(
df.copy(deep=True), horizon=horizon
) # Deep copy of df is important, because we want a fresh start every iteration!
res["Horizon"] = horizon
res["horizon"] = horizon
result = result.append(res)

# Invalidate features that are not available for a specific horizon due to data
# latency
for feature, time in LATENCY_CONFIG.items():
result.loc[result["Horizon"] > time, feature] = np.nan
for feature, time in latency_config.items():
result.loc[result["horizon"] > time, feature] = np.nan

return result.sort_index()
# Sort all features except for the (first) load and (last) horizon columns
return enforce_feature_order(result)


class OperationalPredictFeatureApplicator(AbstractFeatureApplicator):
Expand All @@ -98,31 +110,11 @@ def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
if num_horizons != 1:
raise ValueError("Expected one horizon, got {num_horizons}")

df = apply_features(df, features=self.features, horizon=self.horizons[0])
df = add_missing_feature_columns(df, self.features)
df = remove_extra_feature_columns(df, self.features)

return df


class BackTestPredictFeatureApplicator(AbstractFeatureApplicator):
def add_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Adds features to an input DataFrame.
This method is implemented specifically for a backtest prediction for a specific horizon.
All features that are not available for the specific horizon are invalidated.
Args:
df: pd.DataFrame with input data to which the features have to be added
Returns:
pd.DataFrame: Input DataFrame with an extra column for every added feature.
"""
num_horizons = len(self.horizons)
if num_horizons != 1:
raise ValueError("Expected one horizon, got {num_horizons}")
df = apply_features(
df, feature_names=self.feature_names, horizon=self.horizons[0]
)
df = add_missing_feature_columns(df, self.feature_names)
# NOTE this is required since apply_features could add additional features
df = remove_non_requested_feature_columns(df, self.feature_names)

df = apply_features(df, horizon=self.horizons[0])
df = add_missing_feature_columns(df, self.features)
df = remove_extra_feature_columns(df, self.features)
return df
return enforce_feature_order(df)
90 changes: 75 additions & 15 deletions openstf/feature_engineering/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,36 @@
# SPDX-License-Identifier: MPL-2.0

# -*- coding: utf-8 -*-
from typing import List
import numpy as np
import pandas as pd
import structlog

logger = structlog.get_logger(__name__)


def add_missing_feature_columns(
input_data: pd.DataFrame, featurelist: list
input_data: pd.DataFrame, features: List[str]
) -> pd.DataFrame:
"""Adds feature column for features in the featurelist.
Add feature columns for features in the feature list if these columns don't
exist in the input data. If a column is added, its value is set to NaN.
This is especially usefull to make sure the required columns are in place when making a prediction.
This is especially usefull to make sure the required columns are in place when
making a prediction.
NOTE: this function is intended as a final check to prevent errors during prediction.
In an ideal world this function is not necessary.
Args:
input_data (pd.DataFrame): DataFrame with input data and featurs.
featurelist (list): List of requiered features
features (list): List of required features.
"""
missing_features = [f for f in featurelist if f not in list(input_data)]

logger = structlog.get_logger(__name__)

if features is None:
features = []

missing_features = [f for f in features if f not in list(input_data)]

for feature in missing_features:
logger.warning(
Expand All @@ -37,27 +43,81 @@ def add_missing_feature_columns(
return input_data


def remove_extra_feature_columns(
input_data: pd.DataFrame, featurelist: list
def remove_non_requested_feature_columns(
input_data: pd.DataFrame, requested_features: List[str]
) -> pd.DataFrame:
"""Removes any features that are provided in the input data but not in the feature list.
"""Removes features that are provided in the input data but not in the feature list.
This should not be necessary but serves as an extra failsafe for making predictions
Args:
input_data: (pd.Dataframe) DataFrame with features
featurelist: (list) list of reuqested features
Returns: pd.DataFrame with model input data and fetaures
requested_features: (list) list of requested features
Returns:
pd.DataFrame: Model input data with features.
"""
extra_features = [f for f in list(input_data) if f not in featurelist]
logger = structlog.get_logger(__name__)

num_not_requested_features = len(extra_features)
if requested_features is None:
requested_features = []

not_requested_features = [
f for f in list(input_data) if f not in requested_features
]

# Do not see "load" or "horizon" as an extra feature as it is no feature
if "load" in not_requested_features:
not_requested_features.remove("load")

num_not_requested_features = len(not_requested_features)

if num_not_requested_features != 0:
logger.warning(
f"Removing {num_not_requested_features} unrequested features!",
num_not_requested_features=num_not_requested_features,
)

return input_data.drop(extra_features, axis=1)
return input_data.drop(not_requested_features, axis=1)


def enforce_feature_order(input_data: pd.DataFrame) -> pd.DataFrame:
    """Enforce the canonical order of feature columns.

    Alphabetically orders the feature columns, while the first column (the
    variable to be predicted, usually "load") stays in front and the
    "horizon" column, if present, is moved to the last position:

        [first column] -- [alphabetically sorted features] -- ["horizon"]

    This function assumes the first column contains the to-be-predicted
    variable. The "horizon" column is present during the training process
    but not in the forecasting process.

    Args:
        input_data (pd.DataFrame): Input data with features.

    Returns:
        pd.DataFrame: Input data with columns in the canonical order.
    """
    # Extract the first column name; most of the time this is "load".
    first_column_name = input_data.columns.to_list()[0]

    # Alphabetically sort all columns, then move the first column back
    # to the start.
    sorted_columns = sorted(input_data.columns.to_list())
    sorted_columns.remove(first_column_name)
    column_order = [first_column_name] + sorted_columns

    # If a "horizon" column is available (training process only), move it
    # to the end.
    if "horizon" in column_order[1:]:
        column_order.remove("horizon")
        column_order.append("horizon")

    # Return the dataframe with columns in the correct order.
    return input_data.loc[:, column_order]
Loading

0 comments on commit 6fbf4f2

Please sign in to comment.