From 6c02f722f9b6a6f26bd05c756f1931be3e018ed8 Mon Sep 17 00:00:00 2001 From: Daniel Szemerey Date: Tue, 2 May 2023 18:25:35 +0200 Subject: [PATCH] feature(Analysis): A new `analysis` module that explores features on a rolling basis. `EDA` with finer insights (#151) * feature(Analysis): A new `analysis` module that explores features on a rolling basis. `EDA` with finer insights. * fix: Export. * fix: Type import didn't support python 3.7 * feature: Added plotly as a plotting dependency. * feature: Added networkx as dependency. * fix: Took out step size from default value so that it supports python 3.7 (pandas that support step needs python 3.8 or above). * feature: Support of rolling for older pandas. * feature: Docstring for mkdocs-gallery. * chore: Location of dataset module. * chore: This is not needed. * feature: Added option for notebook interactive. --- docs/api/dataset.md | 2 +- docs/examples/analyse_rolling_corr.py | 71 +++++++++ pyproject.toml | 4 +- src/krisi/analyse/__init__.py | 1 + src/krisi/analyse/correlations.py | 107 ++++++++++++++ src/krisi/{evaluate => analyse}/dataset.py | 0 src/krisi/analyse/utils.py | 16 ++ src/krisi/evaluate/__init__.py | 1 - src/krisi/evaluate/scorecard.py | 4 +- src/krisi/report/graph.py | 163 +++++++++++++++++++++ src/krisi/report/interactive.py | 6 +- 11 files changed, 368 insertions(+), 7 deletions(-) create mode 100644 docs/examples/analyse_rolling_corr.py create mode 100644 src/krisi/analyse/__init__.py create mode 100644 src/krisi/analyse/correlations.py rename src/krisi/{evaluate => analyse}/dataset.py (100%) create mode 100644 src/krisi/analyse/utils.py create mode 100644 src/krisi/report/graph.py diff --git a/docs/api/dataset.md b/docs/api/dataset.md index b6b0bb40..13b8a20f 100644 --- a/docs/api/dataset.md +++ b/docs/api/dataset.md @@ -1 +1 @@ -::: krisi.evaluate.dataset \ No newline at end of file +::: krisi.analyse.dataset \ No newline at end of file diff --git a/docs/examples/analyse_rolling_corr.py b/docs/examples/analyse_rolling_corr.py new file mode 100644 index 00000000..9b3ce76a --- /dev/null +++ b/docs/examples/analyse_rolling_corr.py @@ -0,0 +1,71 @@ +""" +Analysing rolling correlations +=========================== +""" +# mkdocs_gallery_thumbnail_path = 'images/example_thumnail.png' + + +import os + +import numpy as np +import pandas as pd + +from krisi.analyse.correlations import ( + create_rolled_correlation_metrics, + create_summaries, + get_corr_rolled, +) +from krisi.report.graph import create_save_graphs + +measurement_per_hour = 4 +hours_per_day = 24 +days_per_week = 7 +one_day_measurement = measurement_per_hour * hours_per_day +one_week_measurement = one_day_measurement * days_per_week + +window = one_day_measurement +step = 1 + +df = pd.read_csv( + "https://raw.githubusercontent.com/dream-faster/datasets/main/datasets/energy/industrial_pv_load.csv", + parse_dates=["datetime"], + index_col="datetime", +) + +df_returns = df.pct_change().fillna(0) +df_log_returns = pd.DataFrame(np.log(1 + df_returns)) + + +for name, df_ in [ + # ("raw", df), + # ("returns", df_returns), + ("log_returns", df_log_returns), +]: + save_location = f"output/analyse/correlations/{name}" + if not os.path.exists(save_location): + os.makedirs(save_location) + + df_rolled_corr, df_rolled = get_corr_rolled(df_, window, step) + ( + df_rolled, + df_rolled_mean, + df_rolled_mean_treshold, + coor_unrolled_in_time, + corr_std, + corr_mean, + ) = create_rolled_correlation_metrics( + df_rolled_corr, [df_.index[0] for df_ in df_rolled] + ) + + create_summaries( + corr_std, + corr_mean, + coor_unrolled_in_time, + window, + step, + name=name, + save_or_display=["display"], + save_location=save_location, + ) + + create_save_graphs([df_rolled_mean_treshold], save_location) diff --git a/pyproject.toml b/pyproject.toml index e9b82716..d8c27271 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,9 @@ plotting = [ "jupyter_dash", "weasyprint", "kaleido", - "pangocffi" + "pangocffi", + "matplotlib", + "networkx" ] [tool.hatch.envs.quality] diff --git a/src/krisi/analyse/__init__.py b/src/krisi/analyse/__init__.py new file mode 100644 index 00000000..87e7ff48 --- /dev/null +++ b/src/krisi/analyse/__init__.py @@ -0,0 +1 @@ +from .dataset import check_consistency diff --git a/src/krisi/analyse/correlations.py b/src/krisi/analyse/correlations.py new file mode 100644 index 00000000..b4e76e66 --- /dev/null +++ b/src/krisi/analyse/correlations.py @@ -0,0 +1,107 @@ +from functools import reduce +from typing import List, Optional, Tuple + +import matplotlib.pyplot as plt +import pandas as pd +from typing_extensions import Literal + +from .utils import corr_without_symmetry, unroll + + +def get_unrolled_correlation_over_time( + df_rolled_corr: List[pd.DataFrame], start_indecies: List[pd.DatetimeIndex] +) -> pd.DataFrame: + df_unrolled_in_time = pd.concat( + [unroll(df_, start_indecies[i]) for i, df_ in enumerate(df_rolled_corr)], + axis="columns", + ).T + df_unrolled_in_time.index = pd.to_datetime(df_unrolled_in_time.index) + return df_unrolled_in_time + + +def get_mean_corr_matrix(df_rolled_corr: List[pd.DataFrame]) -> pd.DataFrame: + df_rolled_summed = reduce(lambda x, y: x.add(y, fill_value=0), df_rolled_corr) + df_rolled_mean = df_rolled_summed / len(df_rolled_corr) + + return df_rolled_mean + + +def get_corr_rolled( + df: pd.DataFrame, window: int, step: Optional[int] = None +) -> Tuple[List[pd.DataFrame], List[pd.DataFrame]]: + if step is None or step == 1 or step == 0: + df_rolled = [df_ for df_ in df.rolling(window=window)][1:] + else: + df_rolled = [df_ for df_ in df.rolling(window=window, step=step)][1:] + df_rolled_corr = [corr_without_symmetry(df_) for df_ in df_rolled] + + return df_rolled_corr, df_rolled + + +def create_rolled_correlation_metrics( + df_rolled_corr: List[pd.DataFrame], + start_indecies: List[pd.DatetimeIndex], + threshold: float = 0.25, +) -> Tuple[ + List[pd.DataFrame], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series +]: + coor_unrolled_in_time = get_unrolled_correlation_over_time( + df_rolled_corr, start_indecies + ) + + # Correlations properties on unrolled correlation data + corr_std = coor_unrolled_in_time.std() + corr_mean = coor_unrolled_in_time.mean() + + # Mean correlations in a matrix for plotting into a network graph + mean_corr_over_time = get_mean_corr_matrix(df_rolled_corr) + + mean_corr_over_time_treshold = mean_corr_over_time.where( + mean_corr_over_time >= threshold, other=0.0 + ) + return ( + df_rolled_corr, + mean_corr_over_time, + mean_corr_over_time_treshold, + coor_unrolled_in_time, + corr_std, + corr_mean, + ) + + +def create_summaries( + corr_std: pd.Series, + corr_mean: pd.Series, + df_unrolled_in_time: pd.DataFrame, + window: int, + step: int, + name: str, + save_or_display: List[Literal["save", "display"]] = ["display"], + save_location: str = "structured_data/plots", + top_k: int = 2, +) -> None: + fig, ax = plt.subplots(2, 1) + title = f"Summary of {name} | window - {window} ~ step - {step}" + fig.suptitle(title) + + # ranking = corr_mean/corr_std + + df_unrolled_in_time[corr_std.nsmallest(top_k).index.values].plot( + title=f"Correlation over time (for top-{top_k} smallest standard deviation)", + ax=ax[0], + xticks=[], + ) + df_unrolled_in_time[corr_mean.nlargest(top_k).index.values].plot( + title=f"Correlation over time (for top-{top_k} largest mean)", + ax=ax[1], + ) + + result = pd.concat([corr_std, corr_mean], axis="columns", keys=["std", "mean"]) + result.index.name = title + print(result) + + if "save" in save_or_display: + path_friendly = title.replace(" ", "_") + plt.savefig(f"{save_location}/{path_friendly}.png", format="PNG") + if "display" in save_or_display: + plt.show() diff --git a/src/krisi/evaluate/dataset.py b/src/krisi/analyse/dataset.py similarity index 100% rename from src/krisi/evaluate/dataset.py rename to src/krisi/analyse/dataset.py diff --git a/src/krisi/analyse/utils.py b/src/krisi/analyse/utils.py new file mode 100644 index 00000000..ead00338 --- /dev/null +++ b/src/krisi/analyse/utils.py @@ -0,0 +1,16 @@ +import numpy as np +import pandas as pd + + +def corr_without_symmetry(df: pd.DataFrame) -> pd.DataFrame: + corr = df.corr() + return corr.mask(np.tril(np.ones(corr.shape)).astype(np.bool_)) + + +def unroll(df: pd.DataFrame, datetime: pd.DatetimeIndex) -> pd.Series: + df = df.melt(ignore_index=False).dropna() + + df.index = df.index + "_" + df.variable + df = df.drop(columns=["variable"]) + df = df.rename(columns={"value": datetime}) + return df.squeeze() diff --git a/src/krisi/evaluate/__init__.py b/src/krisi/evaluate/__init__.py index 73c4ae73..e4a27d4f 100644 --- a/src/krisi/evaluate/__init__.py +++ b/src/krisi/evaluate/__init__.py @@ -1,5 +1,4 @@ from .compare import compare -from .dataset import check_consistency from .metric import Metric from .score import score, score_in_outsample from .scorecard import ScoreCard diff --git a/src/krisi/evaluate/scorecard.py b/src/krisi/evaluate/scorecard.py index 7040226a..370f0c4d 100644 --- a/src/krisi/evaluate/scorecard.py +++ b/src/krisi/evaluate/scorecard.py @@ -152,9 +152,7 @@ def __init__( self.__dict__["predictions"] = convert_to_series(predictions, "predictions") self.__dict__["sample_type"] = sample_type self.__dict__["rolling_args"] = ( - rolling_args - if rolling_args is not None - else dict(window=len(y) // 100, step=len(y) // 100) + rolling_args if rolling_args is not None else dict(window=len(y) // 100) ) self.__dict__["classification"] = ( is_dataset_classification_like(y) diff --git a/src/krisi/report/graph.py b/src/krisi/report/graph.py new file mode 100644 index 00000000..aa45bb5e --- /dev/null +++ b/src/krisi/report/graph.py @@ -0,0 +1,163 @@ +from typing import List + +import matplotlib.pyplot as plt +import networkx as nx +import numpy as np +import pandas as pd +from typing_extensions import Literal + + +def create_save_graphs( + df_rolled_corr: List[pd.DataFrame], + save_or_display: List[Literal["save", "display"]] = ["save", "display"], + corr_direction: Literal["positive", "negative"] = "positive", + min_correlation: float = 0.01, + save_location: str = "output/analyse/correlations", +) -> None: + for i, corr_df in enumerate(df_rolled_corr): + __display_corr_graph( + corr_df, + save_or_display, + corr_direction, + min_correlation, + save_location, + file_name=str(i), + ) + + +def __display_corr_graph( + corr: pd.DataFrame, + save_or_display: List[Literal["save", "display"]], + corr_direction: Literal["positive", "negative"], + min_correlation: float, + save_location: str, + file_name: str = "0", +) -> None: + feature_node_names = corr.index.values + matrix_array = np.array(corr) + matrix_corr = np.matrix(matrix_array, copy=False, dtype=None) + + G = nx.Graph(matrix_corr) + G = nx.relabel_nodes(G, lambda i: feature_node_names[i]) + G.remove_edges_from(nx.selfloop_edges(G)) + + __create_corr_network( + G, + corr_direction=corr_direction, + min_correlation=min_correlation, + save_location=save_location, + file_name=file_name, + save_or_display=save_or_display, + ) + + +def __create_corr_network( + G: nx.Graph, + corr_direction: Literal["positive", "negative"], + min_correlation: float, + save_location: str, + file_name: str, + save_or_display: List[Literal["save", "display"]], +): + # Creates a copy of the graph + H = G.copy() + + # Checks all the edges and removes some based on corr_direction + for stock1, stock2, weight in G.edges(data=True): + # if we only want to see the positive correlations we then delete the edges with weight smaller than 0 + if corr_direction == "positive": + # it adds a minimum value for correlation. + # If correlation weaker than the min, then it deletes the edge + if weight["weight"] < 0 or weight["weight"] < min_correlation: + H.remove_edge(stock1, stock2) + # this part runs if the corr_direction is negative and removes edges with weights equal or largen than 0 + else: + # it adds a minimum value for correlation. + # If correlation weaker than the min, then it deletes the edge + if weight["weight"] >= 0 or weight["weight"] > min_correlation: + H.remove_edge(stock1, stock2) + + # crates a list for edges and for the weights + edges, weights = zip(*nx.get_edge_attributes(H, "weight").items()) + + # increases the value of weights, so that they are more visible in the graph + weights = tuple([(1 + abs(x)) ** 2 for x in weights]) + + # calculates the degree of each node + # d = nx.degree(H) + # creates list of nodes and a list their degrees that will be used later for their sizes + # nodelist, node_sizes = zip(*d.items()) + + # positions + positions = nx.circular_layout(H) + + # Figure size + plt.figure(figsize=(15, 15)) + + # draws nodes + nx.draw_networkx_nodes( + H, + positions, + node_color="#DA70D6", + # nodelist=nodelist, + # the node size will be now based on its degree + node_size=5000, + # node_size=tuple([x**3 for x in node_sizes]), + alpha=0.8, + ) + + # Styling for labels + nx.draw_networkx_labels(H, positions, font_size=21, font_family="sans-serif") + + # edge colors based on weight direction + if corr_direction == "positive": + edge_colour = plt.cm.GnBu + else: + edge_colour = plt.cm.PuRd + + # draws the edges + nx.draw_networkx_edges( + H, + positions, + edgelist=edges, + style="solid", + # adds width=weights and edge_color = weights + # so that edges are based on the weight parameter + # edge_cmap is for the color scale based on the weight + # edge_vmin and edge_vmax assign the min and max weights for the width + width=weights, + edge_color=weights, + edge_cmap=edge_colour, + edge_vmin=min(weights), + edge_vmax=max(weights), + ) + + plt.axis("off") + + if "save" in save_or_display: + plt.savefig(f"{save_location}/{file_name}_{corr_direction}.png", format="PNG") + if "display" in save_or_display: + plt.show() + + +def create_animation( + path: str = "structured_data/plots", file_suffix: str = "positive" +): + import os + + import matplotlib.pyplot as plt + from matplotlib.animation import FuncAnimation + + files = os.listdir(path) + + nframes = len(files) + plt.subplots_adjust(top=1, bottom=0, left=0, right=1) + + def animate(i: int): + im = plt.imread(f"{path}/{str(i)}_{file_suffix}.png") + plt.imshow(im) + + anim = FuncAnimation( + plt.gcf(), animate, frames=nframes, interval=(2000.0 / nframes) + ) + anim.save(f"{path}/output.gif", writer="imagemagick") diff --git a/src/krisi/report/interactive.py b/src/krisi/report/interactive.py index 7d3ed9d8..e4d5180b 100644 --- a/src/krisi/report/interactive.py +++ b/src/krisi/report/interactive.py @@ -2,6 +2,7 @@ from krisi.evaluate.type import ScoreCardMetadata from krisi.report.type import InteractiveFigure, PlotlyInput +from krisi.utils.environment import is_notebook from krisi.utils.iterable_helpers import flatten if TYPE_CHECKING: @@ -170,4 +171,7 @@ def run_app( + [Input(input_id, "value") for input_id in component.global_input_ids], )(component.get_figure) - app.run(debug=True, threaded=True) + if is_notebook(): + app.run(mode="inline", debug=False, threaded=True) + else: + app.run(debug=False, threaded=True)