-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feature(Analysis): A new
analysis
module that explores features on …
…a rolling basis. `EDA` with finer insights (#151) * feature(Analysis): A new `analysis` module that explores features on a rolling basis. `EDA` with finer insights. * fix: Export. * fix: Type import didn't support python 3.7 * feature: Added plotly as a plotting dependency. * feature: Added networkx as dependency. * fix: Took out step size from default value so that it supports python 3.7 (pandas that support step needs python 3.8 or above). * feature: Support of rolling for older pandas. * feature: Docstring for mkdocs-gallery. * chore: Location of dataset module. * chore: This is not needed. * feature: Added option for notebook interactive.
- Loading branch information
Showing
11 changed files
with
368 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
::: krisi.evaluate.dataset | ||
::: krisi.analyse.dataset |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
""" | ||
Analysing rolling correlations | ||
=========================== | ||
""" | ||
# mkdocs_gallery_thumbnail_path = 'images/example_thumnail.png' | ||
|
||
|
||
import os | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from krisi.analyse.correlations import ( | ||
create_rolled_correlation_metrics, | ||
create_summaries, | ||
get_corr_rolled, | ||
) | ||
from krisi.report.graph import create_save_graphs | ||
|
||
# Number of rows that make up one day / one week at four measurements per hour.
measurement_per_hour = 4
hours_per_day = 24
days_per_week = 7
one_day_measurement = measurement_per_hour * hours_per_day
one_week_measurement = one_day_measurement * days_per_week

# Correlate over a one-day rolling window that advances one row at a time.
window = one_day_measurement
step = 1

df = pd.read_csv(
    "https://raw.githubusercontent.com/dream-faster/datasets/main/datasets/energy/industrial_pv_load.csv",
    parse_dates=["datetime"],
    index_col="datetime",
)

df_returns = df.pct_change().fillna(0)
df_log_returns = pd.DataFrame(np.log(1 + df_returns))


for name, df_ in [
    # ("raw", df),
    # ("returns", df_returns),
    ("log_returns", df_log_returns),
]:
    save_location = f"output/analyse/correlations/{name}"
    # exist_ok replaces the separate "exists?" check and its check-then-create race.
    os.makedirs(save_location, exist_ok=True)

    df_rolled_corr, df_rolled = get_corr_rolled(df_, window, step)
    # Every window is labelled with the index value of its first row.
    window_start_indices = [df_window.index[0] for df_window in df_rolled]
    (
        df_rolled,
        df_rolled_mean,
        df_rolled_mean_treshold,
        coor_unrolled_in_time,
        corr_std,
        corr_mean,
    ) = create_rolled_correlation_metrics(df_rolled_corr, window_start_indices)

    create_summaries(
        corr_std,
        corr_mean,
        coor_unrolled_in_time,
        window,
        step,
        name=name,
        save_or_display=["display"],
        save_location=save_location,
    )

    # save_location has to be passed by keyword: the second positional parameter
    # of create_save_graphs is save_or_display, so a positional path would be
    # searched for the substrings "save"/"display" and nothing would be drawn.
    create_save_graphs([df_rolled_mean_treshold], save_location=save_location)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .dataset import check_consistency |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
from functools import reduce | ||
from typing import List, Optional, Tuple | ||
|
||
import matplotlib.pyplot as plt | ||
import pandas as pd | ||
from typing_extensions import Literal | ||
|
||
from .utils import corr_without_symmetry, unroll | ||
|
||
|
||
def get_unrolled_correlation_over_time(
    df_rolled_corr: List[pd.DataFrame], start_indecies: List[pd.DatetimeIndex]
) -> pd.DataFrame:
    """Stack per-window correlation matrices into one frame indexed by time.

    Args:
        df_rolled_corr: One correlation matrix per rolling window.
        start_indecies: Label for each window, looked up by the window's
            position in ``df_rolled_corr``.

    Returns:
        A frame with one row per window (the index is the window labels
        converted with ``pd.to_datetime``) and one column per entry produced
        by ``unroll``.
    """
    unrolled_windows = []
    for position, window_corr in enumerate(df_rolled_corr):
        unrolled_windows.append(unroll(window_corr, start_indecies[position]))

    # Each unrolled window is a column at this point; transpose so that the
    # windows become rows.
    side_by_side = pd.concat(unrolled_windows, axis="columns")
    over_time = side_by_side.T
    over_time.index = pd.to_datetime(over_time.index)
    return over_time
|
||
|
||
def get_mean_corr_matrix(df_rolled_corr: List[pd.DataFrame]) -> pd.DataFrame:
    """Average a list of correlation matrices element by element.

    A cell that is missing in only one of two matrices being added counts as
    ``0`` for that addition; a cell missing in both stays missing. The sum is
    divided by the number of matrices, not by the number of non-missing
    values.

    Args:
        df_rolled_corr: Matrices to average.

    Returns:
        The element-wise sum divided by ``len(df_rolled_corr)``.
    """

    def _add_filling_gaps(
        running_total: pd.DataFrame, window_corr: pd.DataFrame
    ) -> pd.DataFrame:
        return running_total.add(window_corr, fill_value=0)

    window_count = len(df_rolled_corr)
    summed = reduce(_add_filling_gaps, df_rolled_corr)
    return summed / window_count
|
||
|
||
def get_corr_rolled(
    df: pd.DataFrame, window: int, step: Optional[int] = None
) -> Tuple[List[pd.DataFrame], List[pd.DataFrame]]:
    """Split ``df`` into rolling windows and correlate the columns of each one.

    Args:
        df: Frame whose columns are correlated with each other.
        window: ``window`` argument handed to ``DataFrame.rolling``.
        step: ``step`` argument handed to ``DataFrame.rolling``. It is only
            forwarded when it is something other than ``None``, ``0`` or
            ``1``; for those values ``rolling`` is called without ``step``.

    Returns:
        ``(df_rolled_corr, df_rolled)``: the result of
        ``corr_without_symmetry`` for every window, and the window frames
        themselves. The first window yielded by pandas is dropped from both.
    """
    rolling_kwargs = {"window": window}
    if step is not None and step != 1 and step != 0:
        rolling_kwargs["step"] = step

    every_window = list(df.rolling(**rolling_kwargs))
    df_rolled = every_window[1:]
    df_rolled_corr = list(map(corr_without_symmetry, df_rolled))

    return df_rolled_corr, df_rolled
|
||
|
||
def create_rolled_correlation_metrics(
    df_rolled_corr: List[pd.DataFrame],
    start_indecies: List[pd.DatetimeIndex],
    threshold: float = 0.25,
) -> Tuple[
    List[pd.DataFrame], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series
]:
    """Derive summary statistics from a list of rolling correlation matrices.

    Args:
        df_rolled_corr: One correlation matrix per rolling window.
        start_indecies: Label of each window, in the same order.
        threshold: Cells of the mean matrix that are not ``>= threshold`` are
            replaced by ``0.0`` in the thresholded copy.

    Returns:
        A 6-tuple of, in order: ``df_rolled_corr`` unchanged, the mean
        correlation matrix, the thresholded mean matrix, the correlations
        unrolled over time, and the column-wise standard deviation and mean
        of the unrolled correlations.
    """
    unrolled = get_unrolled_correlation_over_time(df_rolled_corr, start_indecies)
    spread_per_pair = unrolled.std()
    average_per_pair = unrolled.mean()

    # The matrix form is what gets plotted as a network graph.
    mean_matrix = get_mean_corr_matrix(df_rolled_corr)
    strong_enough = mean_matrix >= threshold
    thresholded_mean_matrix = mean_matrix.where(strong_enough, other=0.0)

    return (
        df_rolled_corr,
        mean_matrix,
        thresholded_mean_matrix,
        unrolled,
        spread_per_pair,
        average_per_pair,
    )
|
||
|
||
def create_summaries(
    corr_std: pd.Series,
    corr_mean: pd.Series,
    df_unrolled_in_time: pd.DataFrame,
    window: int,
    step: int,
    name: str,
    save_or_display: Optional[List[Literal["save", "display"]]] = None,
    save_location: str = "structured_data/plots",
    top_k: int = 2,
) -> None:
    """Plot and print a summary of rolling correlations.

    Two stacked panels are drawn: the ``top_k`` columns with the smallest
    standard deviation and the ``top_k`` columns with the largest mean. A
    table of ``std`` and ``mean`` per column is printed to stdout.

    Args:
        corr_std: Standard deviation per column of ``df_unrolled_in_time``.
        corr_mean: Mean per column of ``df_unrolled_in_time``.
        df_unrolled_in_time: Correlations over time, one column per series.
        window: Window size; only used in the title.
        step: Step size; only used in the title.
        name: Name of the analysed data; only used in the title.
        save_or_display: Any combination of ``"save"`` and ``"display"``.
            ``None`` (the default) means ``["display"]``.
        save_location: Directory the PNG is written to when saving. It is not
            created here.
        top_k: Number of columns shown in each panel.

    Note:
        The file name is the title with spaces replaced by underscores, so it
        still contains the ``|`` and ``~`` characters of the title. The figure
        is left open after the call.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if save_or_display is None:
        save_or_display = ["display"]

    fig, ax = plt.subplots(2, 1)
    title = f"Summary of {name} | window - {window} ~ step - {step}"
    fig.suptitle(title)

    steadiest = corr_std.nsmallest(top_k).index.values
    df_unrolled_in_time[steadiest].plot(
        title=f"Correlation over time (for top-{top_k} smallest standard deviation)",
        ax=ax[0],
        xticks=[],
    )
    strongest = corr_mean.nlargest(top_k).index.values
    df_unrolled_in_time[strongest].plot(
        title=f"Correlation over time (for top-{top_k} largest mean)",
        ax=ax[1],
    )

    result = pd.concat([corr_std, corr_mean], axis="columns", keys=["std", "mean"])
    result.index.name = title
    print(result)

    if "save" in save_or_display:
        path_friendly = title.replace(" ", "_")
        # Save through the figure created above rather than whichever figure
        # happens to be current.
        fig.savefig(f"{save_location}/{path_friendly}.png", format="PNG")
    if "display" in save_or_display:
        plt.show()
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def corr_without_symmetry(df: pd.DataFrame) -> pd.DataFrame:
    """Return the correlation matrix of ``df`` with redundant cells blanked.

    The diagonal and everything below it are replaced by NaN, so every pair
    of columns appears exactly once (in the strict upper triangle).
    """
    full_corr = df.corr()
    # True on the diagonal and below it.
    redundant = np.tril(np.ones(full_corr.shape, dtype=np.bool_))
    return full_corr.where(~redundant)
|
||
|
||
def unroll(df: pd.DataFrame, datetime: pd.DatetimeIndex) -> pd.Series:
    """Flatten a matrix into a Series with one entry per non-missing cell.

    Args:
        df: Matrix to flatten. Its row labels must support ``+`` with a
            string, because each entry is labelled
            ``"<row label>_<column label>"``.
        datetime: Used as the name of the returned Series.

    Returns:
        A Series named ``datetime`` holding every non-NaN cell of ``df``. A
        Series is returned even when only a single cell is left.
    """
    # Name the melted columns explicitly so the result does not depend on
    # whether the columns index of ``df`` carries a name of its own.
    long = df.melt(
        ignore_index=False, var_name="variable", value_name="value"
    ).dropna()

    long.index = long.index + "_" + long["variable"]
    # Selecting the column (instead of a bare ``squeeze()``) keeps a one-row
    # result a Series; ``squeeze()`` would collapse it to a scalar.
    return long["value"].rename(datetime)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
from typing import List, Optional

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from typing_extensions import Literal
|
||
|
||
def create_save_graphs(
    df_rolled_corr: List[pd.DataFrame],
    save_or_display: Optional[List[Literal["save", "display"]]] = None,
    corr_direction: Literal["positive", "negative"] = "positive",
    min_correlation: float = 0.01,
    save_location: str = "output/analyse/correlations",
) -> None:
    """Draw one network graph per correlation matrix.

    Args:
        df_rolled_corr: Correlation matrices to draw, one graph each.
        save_or_display: Any combination of ``"save"`` and ``"display"``.
            ``None`` (the default) means ``["save", "display"]``.
        corr_direction: Which sign of correlation to draw.
        min_correlation: Strength threshold forwarded to the graph builder.
        save_location: Directory the images are written to when saving. Each
            matrix's position in ``df_rolled_corr`` is passed on as its file
            name.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if save_or_display is None:
        save_or_display = ["save", "display"]

    for i, corr_df in enumerate(df_rolled_corr):
        __display_corr_graph(
            corr_df,
            save_or_display,
            corr_direction,
            min_correlation,
            save_location,
            file_name=str(i),
        )
|
||
|
||
def __display_corr_graph(
    corr: pd.DataFrame,
    save_or_display: List[Literal["save", "display"]],
    corr_direction: Literal["positive", "negative"],
    min_correlation: float,
    save_location: str,
    file_name: str = "0",
) -> None:
    """Turn a correlation matrix into a graph and hand it over for drawing.

    Args:
        corr: Square matrix used as a weighted adjacency matrix. Node names
            are taken from its row index.
        save_or_display: Any combination of ``"save"`` and ``"display"``.
        corr_direction: Which sign of correlation to draw.
        min_correlation: Strength threshold forwarded to the drawing step.
        save_location: Directory the image is written to when saving.
        file_name: Stem of the image file name.
    """
    feature_node_names = corr.index.values
    # A plain ndarray is enough for networkx; the np.matrix class is no longer
    # recommended by NumPy.
    adjacency = np.asarray(corr)

    G = nx.Graph(adjacency)
    # Nodes are created as positional integers; rename them to the row labels.
    G = nx.relabel_nodes(G, dict(enumerate(feature_node_names)))
    # Materialise the self-loops first so the graph is not iterated while it
    # is being modified.
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    __create_corr_network(
        G,
        corr_direction=corr_direction,
        min_correlation=min_correlation,
        save_location=save_location,
        file_name=file_name,
        save_or_display=save_or_display,
    )
|
||
|
||
def __create_corr_network(
    G: nx.Graph,
    corr_direction: Literal["positive", "negative"],
    min_correlation: float,
    save_location: str,
    file_name: str,
    save_or_display: List[Literal["save", "display"]],
) -> None:
    """Draw a correlation graph on a circular layout.

    Args:
        G: Graph whose edges carry a ``"weight"`` attribute. It is not
            modified; filtering happens on a copy.
        corr_direction: ``"positive"`` keeps edges with a weight of at least
            ``min_correlation`` (and never negative ones). Any other value
            keeps negative edges whose weight is at most
            ``-abs(min_correlation)``.
        min_correlation: Minimum strength an edge needs to be drawn. For the
            negative direction only its magnitude matters, so ``0.3`` and
            ``-0.3`` select the same edges.
        save_location: Directory the image is written to when saving.
        file_name: The image is saved as
            ``{file_name}_{corr_direction}.png``.
        save_or_display: Any combination of ``"save"`` and ``"display"``.

    Note:
        The figure is left open after the call.
    """
    # Filter on a copy so G can be iterated safely while edges are removed.
    H = G.copy()

    for node_a, node_b, attributes in G.edges(data=True):
        weight = attributes["weight"]
        if corr_direction == "positive":
            too_weak = weight < 0 or weight < min_correlation
        else:
            # Compare against the negated magnitude: with a positive
            # min_correlation, "weight > min_correlation" could never remove
            # a negative edge, so the threshold had no effect.
            too_weak = weight >= 0 or weight > -abs(min_correlation)
        if too_weak:
            H.remove_edge(node_a, node_b)

    # Both stay empty when no edge survived the filter; unpacking zip(*items)
    # would raise in that case.
    edge_weights = nx.get_edge_attributes(H, "weight")
    edges = tuple(edge_weights)
    # Exaggerate the weights so that differences are visible in the drawing.
    weights = tuple((1 + abs(w)) ** 2 for w in edge_weights.values())

    positions = nx.circular_layout(H)

    plt.figure(figsize=(15, 15))

    nx.draw_networkx_nodes(
        H,
        positions,
        node_color="#DA70D6",
        node_size=5000,
        alpha=0.8,
    )
    nx.draw_networkx_labels(H, positions, font_size=21, font_family="sans-serif")

    # The colour map depends on the direction being shown.
    if corr_direction == "positive":
        edge_colour = plt.cm.GnBu
    else:
        edge_colour = plt.cm.PuRd

    if edges:
        # Width and colour of every edge both follow its (exaggerated) weight;
        # edge_vmin/edge_vmax pin the colour scale to the observed range.
        nx.draw_networkx_edges(
            H,
            positions,
            edgelist=edges,
            style="solid",
            width=weights,
            edge_color=weights,
            edge_cmap=edge_colour,
            edge_vmin=min(weights),
            edge_vmax=max(weights),
        )

    plt.axis("off")

    if "save" in save_or_display:
        plt.savefig(f"{save_location}/{file_name}_{corr_direction}.png", format="PNG")
    if "display" in save_or_display:
        plt.show()
|
||
|
||
def create_animation(
    path: str = "structured_data/plots", file_suffix: str = "positive"
):
    """Combine numbered PNG frames in ``path`` into ``output.gif``.

    Frames are the files named ``<index>_<file_suffix>.png``; they are played
    in increasing index order and the whole animation is spread over two
    seconds. Any other file in the directory is ignored.

    Args:
        path: Directory that holds the frames and receives ``output.gif``.
        file_suffix: Suffix that selects which frames to use.

    Raises:
        FileNotFoundError: If ``path`` contains no matching frame.
    """
    import os
    import re

    import matplotlib.pyplot as plt
    from matplotlib.animation import FuncAnimation

    # Only count real frames: the directory may also hold output.gif, frames
    # with another suffix, or unrelated files.
    frame_pattern = re.compile(rf"(\d+)_{re.escape(file_suffix)}\.png")
    numbered_frames = []
    for entry in os.listdir(path):
        match = frame_pattern.fullmatch(entry)
        if match is not None:
            numbered_frames.append((int(match.group(1)), entry))
    if not numbered_frames:
        raise FileNotFoundError(
            f"No '<index>_{file_suffix}.png' frames found in '{path}'."
        )
    # Sort numerically so that frame 10 comes after frame 9, not after frame 1.
    frame_files = [entry for _, entry in sorted(numbered_frames)]

    nframes = len(frame_files)
    plt.subplots_adjust(top=1, bottom=0, left=0, right=1)

    def animate(i: int):
        im = plt.imread(f"{path}/{frame_files[i]}")
        # Drop the previous frame instead of stacking images on the axes.
        plt.cla()
        plt.imshow(im)

    anim = FuncAnimation(
        plt.gcf(), animate, frames=nframes, interval=(2000.0 / nframes)
    )
    anim.save(f"{path}/output.gif", writer="imagemagick")
Oops, something went wrong.