Skip to content

Commit

Permalink
feature(Analysis): A new analysis module that explores features on …
Browse files Browse the repository at this point in the history
…a rolling basis. `EDA` with finer insights (#151)

* feature(Analysis): A new `analysis` module that explores features on a rolling basis. `EDA` with finer insights.

* fix: Export.

* fix: Type import didn't support python 3.7

* feature: Added plotly as a plotting dependency.

* feature: Added networkx as dependency.

* fix: Took out step size from default value so that it supports python 3.7 (pandas that support step needs python 3.8 or above).

* feature: Support of rolling for older pandas.

* feature: Docstring for mkdocs-gallery.

* chore: Location of dataset module.

* chore: This is not needed.

* feature: Added option for notebook interactive.
  • Loading branch information
szemyd authored May 2, 2023
1 parent a6aa738 commit 6c02f72
Show file tree
Hide file tree
Showing 11 changed files with 368 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/api/dataset.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
::: krisi.evaluate.dataset
::: krisi.analyse.dataset
71 changes: 71 additions & 0 deletions docs/examples/analyse_rolling_corr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
Analysing rolling correlations
===========================
"""
# mkdocs_gallery_thumbnail_path = 'images/example_thumnail.png'


import os

import numpy as np
import pandas as pd

from krisi.analyse.correlations import (
create_rolled_correlation_metrics,
create_summaries,
get_corr_rolled,
)
from krisi.report.graph import create_save_graphs

# Sampling layout used to size the rolling window: the window below spans
# one day's worth of measurements.
measurement_per_hour = 4
hours_per_day = 24
days_per_week = 7
one_day_measurement = measurement_per_hour * hours_per_day
one_week_measurement = one_day_measurement * days_per_week

window = one_day_measurement
step = 1

df = pd.read_csv(
    "https://raw.githubusercontent.com/dream-faster/datasets/main/datasets/energy/industrial_pv_load.csv",
    parse_dates=["datetime"],
    index_col="datetime",
)

# Derived views of the raw data: simple returns and log returns.
df_returns = df.pct_change().fillna(0)
df_log_returns = pd.DataFrame(np.log(1 + df_returns))


# Un-comment the entries below to run the same analysis on the other views.
for name, df_ in [
    # ("raw", df),
    # ("returns", df_returns),
    ("log_returns", df_log_returns),
]:
    save_location = f"output/analyse/correlations/{name}"
    # exist_ok avoids the separate existence check (and the race it implies).
    os.makedirs(save_location, exist_ok=True)

    df_rolled_corr, df_rolled = get_corr_rolled(df_, window, step)
    (
        df_rolled,
        df_rolled_mean,
        df_rolled_mean_treshold,
        coor_unrolled_in_time,
        corr_std,
        corr_mean,
    ) = create_rolled_correlation_metrics(
        # Each window is labelled by the first timestamp it contains.
        df_rolled_corr,
        [window_df.index[0] for window_df in df_rolled],
    )

    create_summaries(
        corr_std,
        corr_mean,
        coor_unrolled_in_time,
        window,
        step,
        name=name,
        save_or_display=["display"],
        save_location=save_location,
    )

    # The second positional parameter of create_save_graphs is
    # `save_or_display`, so the target directory has to be passed by keyword;
    # passed positionally, the path is treated as the save/display selector
    # and the graph is neither written nor shown.
    create_save_graphs([df_rolled_mean_treshold], save_location=save_location)
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ plotting = [
"jupyter_dash",
"weasyprint",
"kaleido",
"pangocffi"
"pangocffi",
"matplotlib",
"networkx"
]

[tool.hatch.envs.quality]
Expand Down
1 change: 1 addition & 0 deletions src/krisi/analyse/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .dataset import check_consistency
107 changes: 107 additions & 0 deletions src/krisi/analyse/correlations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from functools import reduce
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import pandas as pd
from typing_extensions import Literal

from .utils import corr_without_symmetry, unroll


def get_unrolled_correlation_over_time(
    df_rolled_corr: List[pd.DataFrame], start_indecies: List[pd.DatetimeIndex]
) -> pd.DataFrame:
    """Flatten every rolled correlation matrix and stack the results by time.

    Args:
        df_rolled_corr: One correlation matrix per rolling window.
        start_indecies: The label for each window, looked up by position;
            it must have at least as many entries as ``df_rolled_corr``.

    Returns:
        A frame with one row per window (the index is converted with
        ``pd.to_datetime``) and one column per label produced by ``unroll``.
    """
    flattened = []
    for position, corr_frame in enumerate(df_rolled_corr):
        flattened.append(unroll(corr_frame, start_indecies[position]))

    # Each unrolled window becomes a column first; transposing puts the
    # windows on the rows.
    side_by_side = pd.concat(flattened, axis="columns")
    by_time = side_by_side.T
    by_time.index = pd.to_datetime(by_time.index)
    return by_time


def get_mean_corr_matrix(df_rolled_corr: List[pd.DataFrame]) -> pd.DataFrame:
    """Average a list of correlation matrices element-wise.

    A cell that is missing in only one of two frames being added counts as 0
    for that frame; a cell missing in both stays missing. The sum is divided
    by the total number of frames, regardless of how many were missing.

    Args:
        df_rolled_corr: The matrices to average; must not be empty.

    Returns:
        The element-wise mean matrix.
    """

    def _accumulate(running_total: pd.DataFrame, nxt: pd.DataFrame) -> pd.DataFrame:
        return running_total.add(nxt, fill_value=0)

    total = reduce(_accumulate, df_rolled_corr)
    how_many = len(df_rolled_corr)
    return total / how_many


def get_corr_rolled(
    df: pd.DataFrame, window: int, step: Optional[int] = None
) -> Tuple[List[pd.DataFrame], List[pd.DataFrame]]:
    """Split ``df`` into rolling windows and correlate each one.

    Args:
        df: The data to roll over.
        window: Rolling window size, forwarded to ``DataFrame.rolling``.
        step: Optional stride. ``None``, ``0`` and ``1`` all mean "do not
            pass ``step`` to pandas", so the default path never touches the
            ``step`` keyword.

    Returns:
        ``(correlations, windows)``: the windows are every frame yielded by
        iterating the rolling object except the first one, and the
        correlations are ``corr_without_symmetry`` applied to each window.
    """
    uses_step = step is not None and step != 1 and step != 0
    if uses_step:
        roller = df.rolling(window=window, step=step)
    else:
        roller = df.rolling(window=window)

    windows = [chunk for chunk in roller][1:]
    correlations = [corr_without_symmetry(chunk) for chunk in windows]
    return correlations, windows


def create_rolled_correlation_metrics(
    df_rolled_corr: List[pd.DataFrame],
    start_indecies: List[pd.DatetimeIndex],
    threshold: float = 0.25,
) -> Tuple[
    List[pd.DataFrame], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series
]:
    """Derive summary statistics from a list of rolled correlation matrices.

    Args:
        df_rolled_corr: One correlation matrix per rolling window.
        start_indecies: One label per window, used to index the unrolled
            correlations in time.
        threshold: Cells of the mean matrix that are not ``>= threshold``
            (including missing cells) are replaced by ``0.0`` in the
            thresholded copy.

    Returns:
        A 6-tuple of: the input list unchanged, the mean correlation matrix,
        its thresholded copy, the correlations unrolled over time, and the
        per-column standard deviation and mean of that unrolled frame.
    """
    unrolled = get_unrolled_correlation_over_time(df_rolled_corr, start_indecies)

    # Column-wise statistics of the unrolled correlations.
    spread = unrolled.std()
    average = unrolled.mean()

    # Matrix-shaped mean, suitable for drawing as a network graph.
    mean_matrix = get_mean_corr_matrix(df_rolled_corr)
    meets_threshold = mean_matrix >= threshold
    mean_matrix_thresholded = mean_matrix.where(meets_threshold, other=0.0)

    return (
        df_rolled_corr,
        mean_matrix,
        mean_matrix_thresholded,
        unrolled,
        spread,
        average,
    )


def create_summaries(
    corr_std: pd.Series,
    corr_mean: pd.Series,
    df_unrolled_in_time: pd.DataFrame,
    window: int,
    step: int,
    name: str,
    save_or_display: List[Literal["save", "display"]] = ["display"],
    save_location: str = "structured_data/plots",
    top_k: int = 2,
) -> None:
    """Plot and print a summary of rolling correlations.

    Two stacked plots are drawn: the ``top_k`` columns with the smallest
    standard deviation, and the ``top_k`` columns with the largest mean.
    A table of standard deviation and mean per column is printed.

    Args:
        corr_std: Standard deviation per column of ``df_unrolled_in_time``.
        corr_mean: Mean per column of ``df_unrolled_in_time``.
        df_unrolled_in_time: Correlations over time, one column per entry of
            ``corr_std`` / ``corr_mean``.
        window: Rolling window size; only used in the title.
        step: Rolling step; only used in the title.
        name: Name of the analysed dataset; only used in the title.
        save_or_display: Any combination of ``"save"`` and ``"display"``.
            The default list is only read, never mutated.
        save_location: Directory that receives the PNG when saving; it is
            created if it does not exist.
        top_k: Number of columns shown in each plot.
    """
    # Local import so this function does not rely on edits to the module's
    # import block.
    import os

    fig, ax = plt.subplots(2, 1)
    title = f"Summary of {name} | window - {window} ~ step - {step}"
    fig.suptitle(title)

    most_stable = corr_std.nsmallest(top_k).index.values
    df_unrolled_in_time[most_stable].plot(
        title=f"Correlation over time (for top-{top_k} smallest standard deviation)",
        ax=ax[0],
        xticks=[],
    )
    strongest = corr_mean.nlargest(top_k).index.values
    df_unrolled_in_time[strongest].plot(
        title=f"Correlation over time (for top-{top_k} largest mean)",
        ax=ax[1],
    )

    result = pd.concat([corr_std, corr_mean], axis="columns", keys=["std", "mean"])
    result.index.name = title
    print(result)

    if "save" in save_or_display:
        # savefig does not create missing directories.
        if save_location:
            os.makedirs(save_location, exist_ok=True)
        path_friendly = title.replace(" ", "_")
        # Save through the figure object rather than pyplot's implicit
        # "current figure", so the right figure is always written.
        fig.savefig(f"{save_location}/{path_friendly}.png", format="PNG")
    if "display" in save_or_display:
        plt.show()
    else:
        # Nothing will ever show this figure; release it so repeated calls
        # do not accumulate open figures in pyplot's registry.
        plt.close(fig)
File renamed without changes.
16 changes: 16 additions & 0 deletions src/krisi/analyse/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import numpy as np
import pandas as pd


def corr_without_symmetry(df: pd.DataFrame) -> pd.DataFrame:
    """Return the correlation matrix of ``df`` with each pair listed once.

    The diagonal and everything below it are replaced by NaN, leaving only
    the strict upper triangle of ``df.corr()``.
    """
    pairwise = df.corr()
    # True on and below the diagonal: the cells to blank out.
    diagonal_and_below = np.tril(np.ones(pairwise.shape, dtype=np.bool_))
    return pairwise.mask(diagonal_and_below)


def unroll(df: pd.DataFrame, datetime: pd.DatetimeIndex) -> pd.Series:
    """Flatten a (partially masked) matrix into a Series of labelled cells.

    Every non-missing cell becomes one entry whose label is
    ``"<row label>_<column label>"``; missing cells are dropped.

    Args:
        df: The matrix to flatten; row and column labels are expected to be
            strings, since they are joined with ``"_"``.
        datetime: Used as the name of the returned Series.

    Returns:
        A Series named ``datetime``. It is always a Series, including when
        only a single cell is non-missing.
    """
    # Name the melted columns explicitly: by default pandas names the
    # variable column after ``df.columns.name`` when that is set, which would
    # break the lookup below.
    long_form = df.melt(
        ignore_index=False, var_name="variable", value_name="value"
    ).dropna()

    long_form.index = long_form.index + "_" + long_form["variable"]

    # Select the column instead of calling ``squeeze()``: squeezing a frame
    # with a single row and a single column returns a bare scalar, which
    # callers cannot concatenate.
    unrolled = long_form["value"].copy()
    unrolled.name = datetime
    return unrolled
1 change: 0 additions & 1 deletion src/krisi/evaluate/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from .compare import compare
from .dataset import check_consistency
from .metric import Metric
from .score import score, score_in_outsample
from .scorecard import ScoreCard
Expand Down
4 changes: 1 addition & 3 deletions src/krisi/evaluate/scorecard.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,7 @@ def __init__(
self.__dict__["predictions"] = convert_to_series(predictions, "predictions")
self.__dict__["sample_type"] = sample_type
self.__dict__["rolling_args"] = (
rolling_args
if rolling_args is not None
else dict(window=len(y) // 100, step=len(y) // 100)
rolling_args if rolling_args is not None else dict(window=len(y) // 100)
)
self.__dict__["classification"] = (
is_dataset_classification_like(y)
Expand Down
163 changes: 163 additions & 0 deletions src/krisi/report/graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
from typing import List

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from typing_extensions import Literal


def create_save_graphs(
    df_rolled_corr: List[pd.DataFrame],
    save_or_display: List[Literal["save", "display"]] = ["save", "display"],
    corr_direction: Literal["positive", "negative"] = "positive",
    min_correlation: float = 0.01,
    save_location: str = "output/analyse/correlations",
) -> None:
    """Draw one correlation network graph per matrix in ``df_rolled_corr``.

    Args:
        df_rolled_corr: Correlation matrices to draw, in order.
        save_or_display: Any combination of ``"save"`` and ``"display"``.
        corr_direction: Which sign of correlation to keep in the graph.
        min_correlation: Threshold forwarded to the edge filter.
        save_location: Directory for the saved images; each file is named
            after the position of its matrix in the list.
    """
    shared_options = dict(
        save_or_display=save_or_display,
        corr_direction=corr_direction,
        min_correlation=min_correlation,
        save_location=save_location,
    )
    for position, matrix in enumerate(df_rolled_corr):
        __display_corr_graph(matrix, file_name=str(position), **shared_options)


def __display_corr_graph(
    corr: pd.DataFrame,
    save_or_display: List[Literal["save", "display"]],
    corr_direction: Literal["positive", "negative"],
    min_correlation: float,
    save_location: str,
    file_name: str = "0",
) -> None:
    """Turn a correlation matrix into a graph and hand it to the drawer.

    Args:
        corr: Square matrix whose cells become edge weights; the row labels
            are used as node names.
        save_or_display: Any combination of ``"save"`` and ``"display"``.
        corr_direction: Which sign of correlation to keep.
        min_correlation: Threshold forwarded to the edge filter.
        save_location: Directory for the saved image.
        file_name: Prefix of the saved image's file name.
    """
    feature_node_names = corr.index.values

    # A plain ndarray is enough for networkx; the np.matrix wrapper is
    # discouraged by NumPy and adds nothing here.
    adjacency = corr.to_numpy()

    G = nx.Graph(adjacency)
    # Nodes start out as positional integers; map each to its feature name.
    G = nx.relabel_nodes(G, dict(enumerate(feature_node_names)))
    # Materialise the self-loops first so the graph is not modified while the
    # generator that lists them is still being consumed.
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    __create_corr_network(
        G,
        corr_direction=corr_direction,
        min_correlation=min_correlation,
        save_location=save_location,
        file_name=file_name,
        save_or_display=save_or_display,
    )


def __create_corr_network(
    G: nx.Graph,
    corr_direction: Literal["positive", "negative"],
    min_correlation: float,
    save_location: str,
    file_name: str,
    save_or_display: List[Literal["save", "display"]],
) -> None:
    """Draw a circular network of the correlations that pass the filter.

    Args:
        G: Graph whose edges carry a ``"weight"`` attribute; it is not
            modified, a filtered copy is drawn instead.
        corr_direction: ``"positive"`` keeps edges with weight
            ``>= min_correlation`` (and ``>= 0``); any other value keeps
            negative edges at least ``abs(min_correlation)`` strong.
        min_correlation: Minimum correlation strength for an edge to be kept.
        save_location: Directory for the PNG; created if missing.
        file_name: The file is written as
            ``{file_name}_{corr_direction}.png``.
        save_or_display: Any combination of ``"save"`` and ``"display"``.
    """
    # Local import so this function does not rely on edits to the module's
    # import block.
    import os

    # Filter on a copy so the edges of G can be iterated safely.
    H = G.copy()
    keep_positive = corr_direction == "positive"
    # For the negative direction the threshold is a strength: with the
    # original `w > min_correlation` test a positive threshold could never
    # remove a negative edge, so weak negative correlations were all kept.
    negative_cutoff = -abs(min_correlation)

    for node_a, node_b, attributes in G.edges(data=True):
        w = attributes["weight"]
        if keep_positive:
            too_weak = w < 0 or w < min_correlation
        else:
            too_weak = w >= 0 or w > negative_cutoff
        if too_weak:
            H.remove_edge(node_a, node_b)

    # Built without zip(*...) so an empty edge set yields two empty tuples
    # instead of raising on unpacking.
    surviving = nx.get_edge_attributes(H, "weight")
    edges = tuple(surviving.keys())
    # Inflate the weights so differences are visible as line widths.
    weights = tuple((1 + abs(x)) ** 2 for x in surviving.values())

    positions = nx.circular_layout(H)

    fig = plt.figure(figsize=(15, 15))

    nx.draw_networkx_nodes(
        H,
        positions,
        node_color="#DA70D6",
        node_size=5000,
        alpha=0.8,
    )
    nx.draw_networkx_labels(H, positions, font_size=21, font_family="sans-serif")

    # Colour map depends on the direction being shown.
    edge_colour = plt.cm.GnBu if keep_positive else plt.cm.PuRd

    # With no surviving edge there is nothing to draw, and min()/max() of an
    # empty tuple would raise; the nodes are still rendered.
    if edges:
        nx.draw_networkx_edges(
            H,
            positions,
            edgelist=edges,
            style="solid",
            # Width and colour both follow the (inflated) weight; vmin/vmax
            # pin the colour scale to the range actually present.
            width=weights,
            edge_color=weights,
            edge_cmap=edge_colour,
            edge_vmin=min(weights),
            edge_vmax=max(weights),
        )

    plt.axis("off")

    if "save" in save_or_display:
        # savefig does not create missing directories.
        if save_location:
            os.makedirs(save_location, exist_ok=True)
        fig.savefig(f"{save_location}/{file_name}_{corr_direction}.png", format="PNG")
    if "display" in save_or_display:
        plt.show()
    else:
        # One 15x15 figure is created per call; release it when nothing will
        # show it, so looping over many matrices does not pile up figures.
        plt.close(fig)


def create_animation(
    path: str = "structured_data/plots", file_suffix: str = "positive"
) -> None:
    """Assemble the numbered frame images in ``path`` into ``output.gif``.

    Frames are the files named ``<number>_<file_suffix>.png``; they are
    played in numeric order over roughly two seconds in total and written to
    ``<path>/output.gif`` with the ``imagemagick`` writer.

    Args:
        path: Directory holding the frames; also receives the GIF.
        file_suffix: Suffix selecting which series of frames to animate.

    Raises:
        ValueError: If ``path`` contains no matching frame.
    """
    import os

    import matplotlib.pyplot as plt
    from matplotlib.animation import FuncAnimation

    # Only count real frames. Using every directory entry (other suffixes, a
    # previously written output.gif, ...) makes the animation ask for frame
    # numbers that do not exist on disk.
    frame_ending = f"_{file_suffix}.png"
    frame_ids = []
    for entry in os.listdir(path):
        if not entry.endswith(frame_ending):
            continue
        stem = entry[: -len(frame_ending)]
        # The round-trip check rejects names such as "007" that would not map
        # back to the same file name.
        if stem.isdecimal() and str(int(stem)) == stem:
            frame_ids.append(int(stem))
    frame_ids.sort()

    if not frame_ids:
        raise ValueError(
            f"No frames matching '<number>{frame_ending}' found in '{path}'."
        )

    nframes = len(frame_ids)
    plt.subplots_adjust(top=1, bottom=0, left=0, right=1)

    def animate(i: int):
        im = plt.imread(f"{path}/{str(i)}_{file_suffix}.png")
        plt.imshow(im)

    anim = FuncAnimation(
        plt.gcf(), animate, frames=frame_ids, interval=(2000.0 / nframes)
    )
    anim.save(f"{path}/output.gif", writer="imagemagick")
Loading

0 comments on commit 6c02f72

Please sign in to comment.