Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preprocessing Functions for Time-Based Windowing, Differencing, and Data Transformation. Xgboost for GridSearch #54

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,5 @@ dmypy.json
# Pyre type checker
.pyre/

# Temporary files
temp.py
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"editor.formatOnSave": false
}
16 changes: 15 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,18 @@ To apply the new logic in the library, it will be necessary to implement the use

### Additional Features

Preferably, use the `_bibmon_tools.py` file to implement additional features.
Preferably, use the `_bibmon_tools.py` file to implement additional features.

### Testing New Functionalities

Before adding new functionality, install the testing dependencies by running the following command:

```bash
pip install -r test/requirements.txt
```

After implementing the new functionality, run the test suite to confirm that the new code works correctly:

```bash
pytest
```
4 changes: 2 additions & 2 deletions bibmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
from ._sbm import SBM
from ._sklearn_regressor import sklearnRegressor
from ._preprocess import PreProcess
from ._load_data import load_tennessee_eastman, load_real_data
from ._load_data import load_tennessee_eastman, load_real_data, load_3w
from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows

__all__ = ['Autoencoder','PCA','ESN','SBM',
'sklearnRegressor', 'PreProcess',
'load_tennessee_eastman', 'load_real_data',
'load_tennessee_eastman', 'load_real_data', 'load_3w',
'train_val_test_split', 'complete_analysis', 'comparative_table',
'spearmanr_dendrogram', 'create_df_with_dates',
'create_df_with_noise', 'align_dfs_by_rows']
85 changes: 84 additions & 1 deletion bibmon/_bibmon_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from typing import Literal

###############################################################################

Expand Down Expand Up @@ -692,4 +693,86 @@ def comparative_table (models, X_train, X_validation, X_test,

return_tables.append(times_df)

return return_tables
return return_tables

##############################################################################

def find_df_transitions(
    df: pd.DataFrame,
    threshold: float = 1,
    data_type: Literal["string", "number"] = "number",
    label: str = None,
) -> list[int]:
    """
    Finds transitions in one column of a DataFrame. This can be used to find
    the positions of interesting events in the data.

    The column is scanned from top to bottom while keeping a reference value,
    which starts as the value of the first row. Whenever a row differs from
    the reference value (see ``data_type``), its position is recorded and its
    value becomes the new reference value.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be analyzed.
    threshold: float, optional
        Minimum absolute difference between a value and the current
        reference value for the row to count as a transition. Used only
        if data_type is 'number'.
    data_type: str, optional
        Type of data to be analyzed. If 'number', a row is a transition when
        its absolute difference to the reference value is greater than
        the threshold. If 'string', a row is a transition when its value is
        not equal to the reference value. Any other value yields no
        transitions.
    label: str
        Name of the column to be analyzed. If None, no analysis is done.

    Returns
    ----------
    : list of ints
        Positions (0-based row numbers, as used by ``iloc``) of the
        transitions. Empty if label is None or the column has no rows.
    """

    if label is None:
        return []

    # Extract the column once instead of looking it up again for every row.
    values = df[label].tolist()

    # An empty column has no first row to compare against.
    if not values:
        return []

    # Choose the comparison once, outside the loop.
    if data_type == "number":
        def is_transition(current, reference):
            return abs(current - reference) > threshold
    elif data_type == "string":
        def is_transition(current, reference):
            return current != reference
    else:
        # Unknown data types never produce transitions.
        return []

    transitions = []
    reference = values[0]

    for position, current in enumerate(values[1:], start=1):
        if is_transition(current, reference):
            transitions.append(position)
            # Later rows are compared against the most recent transition.
            reference = current

    return transitions

###############################################################################

def split_df_percentages(df: pd.DataFrame, percentages: list[float]) -> list[pd.DataFrame]:
    """
    Splits a DataFrame into consecutive DataFrames according to the given
    percentages. The percentages are fractions and must add up to 1.

    For example, if percentages = [0.6, 0.2, 0.2], the function will return
    a list with three DataFrames, the first one with 60% of the rows, the
    second one with 20% and the third one with 20%.

    Warning: the size of each part is truncated to a whole number of rows,
    so trailing rows may be left out if the split cannot be done exactly
    according to the percentages.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be split.
    percentages: list of floats
        Fractions of the data to be used in each part, in order.

    Returns
    ----------
    : list of pandas.DataFrames
        List with the split DataFrames, in the same order as percentages.

    Raises
    ----------
    ValueError
        If the percentages do not add up to 1.
    """

    # Compare with a tolerance: fractions such as [0.7, 0.2, 0.1] add up to
    # 0.9999999999999999 in floating point, so an exact equality test would
    # reject valid input.
    if abs(sum(percentages) - 1) > 1e-9:
        raise ValueError("The sum of the percentages must be 1.")

    total_rows = len(df)
    split_dfs = []
    start = 0

    for fraction in percentages:
        # Truncate to whole rows; each part starts where the previous ended.
        end = start + int(fraction * total_rows)
        split_dfs.append(df.iloc[start:end])
        start = end

    return split_dfs
Loading