feature(Analysis): A new analysis module that explores features on …

…a rolling basis. `EDA` with finer insights (#151) * feature(Analysis): A new `analysis` module that explores features on a rolling basis. `EDA` with finer insights. * fix: Export. * fix: Type import didn't support python 3.7 * feature: Added plotly as a plotting dependency. * feature: Added networkx as dependency. * fix: Took out step size from default value so that it supports python 3.7 (pandas that support step needs python 3.8 or above). * feature: Support of rolling for older pandas. * feature: Docstring for mkdocs-gallery. * chore: Location of dataset module. * chore: This is not needed. * feature: Added option for notebook interactive.
dream-faster · May 2, 2023 · 6c02f72 · 6c02f72
1 parent a6aa738
commit 6c02f72
Show file tree

Hide file tree

Showing 11 changed files with 368 additions and 7 deletions.
diff --git a/docs/api/dataset.md b/docs/api/dataset.md
@@ -1 +1 @@
-::: krisi.evaluate.dataset
+::: krisi.analyse.dataset
diff --git a/docs/examples/analyse_rolling_corr.py b/docs/examples/analyse_rolling_corr.py
@@ -0,0 +1,71 @@
+"""
+Analysing rolling correlations
+===========================
+"""
+# mkdocs_gallery_thumbnail_path = 'images/example_thumnail.png'
+
+
+import os
+
+import numpy as np
+import pandas as pd
+
+from krisi.analyse.correlations import (
+    create_rolled_correlation_metrics,
+    create_summaries,
+    get_corr_rolled,
+)
+from krisi.report.graph import create_save_graphs
+
+measurement_per_hour = 4
+hours_per_day = 24
+days_per_week = 7
+one_day_measurement = measurement_per_hour * hours_per_day
+one_week_measurement = one_day_measurement * days_per_week
+
+window = one_day_measurement
+step = 1
+
+df = pd.read_csv(
+    "https://raw.githubusercontent.com/dream-faster/datasets/main/datasets/energy/industrial_pv_load.csv",
+    parse_dates=["datetime"],
+    index_col="datetime",
+)
+
+df_returns = df.pct_change().fillna(0)
+df_log_returns = pd.DataFrame(np.log(1 + df_returns))
+
+
+for name, df_ in [
+    # ("raw", df),
+    # ("returns", df_returns),
+    ("log_returns", df_log_returns),
+]:
+    save_location = f"output/analyse/correlations/{name}"
+    if not os.path.exists(save_location):
+        os.makedirs(save_location)
+
+    df_rolled_corr, df_rolled = get_corr_rolled(df_, window, step)
+    (
+        df_rolled,
+        df_rolled_mean,
+        df_rolled_mean_treshold,
+        coor_unrolled_in_time,
+        corr_std,
+        corr_mean,
+    ) = create_rolled_correlation_metrics(
+        df_rolled_corr, [df_.index[0] for df_ in df_rolled]
+    )
+
+    create_summaries(
+        corr_std,
+        corr_mean,
+        coor_unrolled_in_time,
+        window,
+        step,
+        name=name,
+        save_or_display=["display"],
+        save_location=save_location,
+    )
+
+    create_save_graphs([df_rolled_mean_treshold], save_location)
diff --git a/pyproject.toml b/pyproject.toml
@@ -83,7 +83,9 @@ plotting = [
   "jupyter_dash",
   "weasyprint",
   "kaleido",
-  "pangocffi"
+  "pangocffi",
+  "matplotlib",
+  "networkx"
 ]
 
 [tool.hatch.envs.quality]

diff --git a/src/krisi/analyse/__init__.py b/src/krisi/analyse/__init__.py
@@ -0,0 +1 @@
+from .dataset import check_consistency
diff --git a/src/krisi/analyse/correlations.py b/src/krisi/analyse/correlations.py
@@ -0,0 +1,107 @@
+from functools import reduce
+from typing import List, Optional, Tuple
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from typing_extensions import Literal
+
+from .utils import corr_without_symmetry, unroll
+
+
+def get_unrolled_correlation_over_time(
+    df_rolled_corr: List[pd.DataFrame], start_indecies: List[pd.DatetimeIndex]
+) -> pd.DataFrame:
+    df_unrolled_in_time = pd.concat(
+        [unroll(df_, start_indecies[i]) for i, df_ in enumerate(df_rolled_corr)],
+        axis="columns",
+    ).T
+    df_unrolled_in_time.index = pd.to_datetime(df_unrolled_in_time.index)
+    return df_unrolled_in_time
+
+
+def get_mean_corr_matrix(df_rolled_corr: List[pd.DataFrame]) -> pd.DataFrame:
+    df_rolled_summed = reduce(lambda x, y: x.add(y, fill_value=0), df_rolled_corr)
+    df_rolled_mean = df_rolled_summed / len(df_rolled_corr)
+
+    return df_rolled_mean
+
+
+def get_corr_rolled(
+    df: pd.DataFrame, window: int, step: Optional[int] = None
+) -> Tuple[List[pd.DataFrame], List[pd.DataFrame]]:
+    if step is None or step == 1 or step == 0:
+        df_rolled = [df_ for df_ in df.rolling(window=window)][1:]
+    else:
+        df_rolled = [df_ for df_ in df.rolling(window=window, step=step)][1:]
+    df_rolled_corr = [corr_without_symmetry(df_) for df_ in df_rolled]
+
+    return df_rolled_corr, df_rolled
+
+
+def create_rolled_correlation_metrics(
+    df_rolled_corr: List[pd.DataFrame],
+    start_indecies: List[pd.DatetimeIndex],
+    threshold: float = 0.25,
+) -> Tuple[
+    List[pd.DataFrame], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series
+]:
+    coor_unrolled_in_time = get_unrolled_correlation_over_time(
+        df_rolled_corr, start_indecies
+    )
+
+    # Correlation properties (std and mean) of the unrolled correlation data
+    corr_std = coor_unrolled_in_time.std()
+    corr_mean = coor_unrolled_in_time.mean()
+
+    # Mean correlations in a matrix for plotting into a network graph
+    mean_corr_over_time = get_mean_corr_matrix(df_rolled_corr)
+
+    mean_corr_over_time_treshold = mean_corr_over_time.where(
+        mean_corr_over_time >= threshold, other=0.0
+    )
+    return (
+        df_rolled_corr,
+        mean_corr_over_time,
+        mean_corr_over_time_treshold,
+        coor_unrolled_in_time,
+        corr_std,
+        corr_mean,
+    )
+
+
+def create_summaries(
+    corr_std: pd.Series,
+    corr_mean: pd.Series,
+    df_unrolled_in_time: pd.DataFrame,
+    window: int,
+    step: int,
+    name: str,
+    save_or_display: List[Literal["save", "display"]] = ["display"],
+    save_location: str = "structured_data/plots",
+    top_k: int = 2,
+) -> None:
+    fig, ax = plt.subplots(2, 1)
+    title = f"Summary of {name} | window - {window} ~ step - {step}"
+    fig.suptitle(title)
+
+    # ranking = corr_mean/corr_std
+
+    df_unrolled_in_time[corr_std.nsmallest(top_k).index.values].plot(
+        title=f"Correlation over time (for top-{top_k} smallest standard deviation)",
+        ax=ax[0],
+        xticks=[],
+    )
+    df_unrolled_in_time[corr_mean.nlargest(top_k).index.values].plot(
+        title=f"Correlation over time (for top-{top_k} largest mean)",
+        ax=ax[1],
+    )
+
+    result = pd.concat([corr_std, corr_mean], axis="columns", keys=["std", "mean"])
+    result.index.name = title
+    print(result)
+
+    if "save" in save_or_display:
+        path_friendly = title.replace(" ", "_")
+        plt.savefig(f"{save_location}/{path_friendly}.png", format="PNG")
+    if "display" in save_or_display:
+        plt.show()
diff --git a/src/krisi/evaluate/dataset.py → src/krisi/analyse/dataset.py b/src/krisi/evaluate/dataset.py → src/krisi/analyse/dataset.py
diff --git a/src/krisi/analyse/utils.py b/src/krisi/analyse/utils.py
@@ -0,0 +1,16 @@
+import numpy as np
+import pandas as pd
+
+
+def corr_without_symmetry(df: pd.DataFrame) -> pd.DataFrame:
+    corr = df.corr()
+    return corr.mask(np.tril(np.ones(corr.shape)).astype(np.bool_))
+
+
+def unroll(df: pd.DataFrame, datetime: pd.DatetimeIndex) -> pd.Series:
+    df = df.melt(ignore_index=False).dropna()
+
+    df.index = df.index + "_" + df.variable
+    df = df.drop(columns=["variable"])
+    df = df.rename(columns={"value": datetime})
+    return df.squeeze()
diff --git a/src/krisi/evaluate/__init__.py b/src/krisi/evaluate/__init__.py
@@ -1,5 +1,4 @@
 from .compare import compare
-from .dataset import check_consistency
 from .metric import Metric
 from .score import score, score_in_outsample
 from .scorecard import ScoreCard

diff --git a/src/krisi/evaluate/scorecard.py b/src/krisi/evaluate/scorecard.py
@@ -152,9 +152,7 @@ def __init__(
         self.__dict__["predictions"] = convert_to_series(predictions, "predictions")
         self.__dict__["sample_type"] = sample_type
         self.__dict__["rolling_args"] = (
-            rolling_args
-            if rolling_args is not None
-            else dict(window=len(y) // 100, step=len(y) // 100)
+            rolling_args if rolling_args is not None else dict(window=len(y) // 100)
         )
         self.__dict__["classification"] = (
             is_dataset_classification_like(y)

diff --git a/src/krisi/report/graph.py b/src/krisi/report/graph.py
@@ -0,0 +1,163 @@
+from typing import List
+
+import matplotlib.pyplot as plt
+import networkx as nx
+import numpy as np
+import pandas as pd
+from typing_extensions import Literal
+
+
+def create_save_graphs(
+    df_rolled_corr: List[pd.DataFrame],
+    save_or_display: List[Literal["save", "display"]] = ["save", "display"],
+    corr_direction: Literal["positive", "negative"] = "positive",
+    min_correlation: float = 0.01,
+    save_location: str = "output/analyse/correlations",
+) -> None:
+    for i, corr_df in enumerate(df_rolled_corr):
+        __display_corr_graph(
+            corr_df,
+            save_or_display,
+            corr_direction,
+            min_correlation,
+            save_location,
+            file_name=str(i),
+        )
+
+
+def __display_corr_graph(
+    corr: pd.DataFrame,
+    save_or_display: List[Literal["save", "display"]],
+    corr_direction: Literal["positive", "negative"],
+    min_correlation: float,
+    save_location: str,
+    file_name: str = "0",
+) -> None:
+    feature_node_names = corr.index.values
+    matrix_array = np.array(corr)
+    matrix_corr = np.matrix(matrix_array, copy=False, dtype=None)
+
+    G = nx.Graph(matrix_corr)
+    G = nx.relabel_nodes(G, lambda i: feature_node_names[i])
+    G.remove_edges_from(nx.selfloop_edges(G))
+
+    __create_corr_network(
+        G,
+        corr_direction=corr_direction,
+        min_correlation=min_correlation,
+        save_location=save_location,
+        file_name=file_name,
+        save_or_display=save_or_display,
+    )
+
+
+def __create_corr_network(
+    G: nx.Graph,
+    corr_direction: Literal["positive", "negative"],
+    min_correlation: float,
+    save_location: str,
+    file_name: str,
+    save_or_display: List[Literal["save", "display"]],
+):
+    # Creates a copy of the graph
+    H = G.copy()
+
+    # Checks all the edges and removes some based on corr_direction
+    for stock1, stock2, weight in G.edges(data=True):
+        # if we only want to see the positive correlations we then delete the edges with weight smaller than 0
+        if corr_direction == "positive":
+            # it adds a minimum value for correlation.
+            # If correlation weaker than the min, then it deletes the edge
+            if weight["weight"] < 0 or weight["weight"] < min_correlation:
+                H.remove_edge(stock1, stock2)
+        # this part runs if the corr_direction is negative and removes edges with weights equal to or larger than 0
+        else:
+            # it adds a minimum value for correlation.
+            # If correlation weaker than the min, then it deletes the edge
+            if weight["weight"] >= 0 or weight["weight"] > min_correlation:
+                H.remove_edge(stock1, stock2)
+
+    # creates a list of edges and a list of their weights
+    edges, weights = zip(*nx.get_edge_attributes(H, "weight").items())
+
+    # increases the value of weights, so that they are more visible in the graph
+    weights = tuple([(1 + abs(x)) ** 2 for x in weights])
+
+    # calculates the degree of each node
+    # d = nx.degree(H)
+    # creates a list of nodes and a list of their degrees that will be used later for their sizes
+    # nodelist, node_sizes = zip(*d.items())
+
+    # positions
+    positions = nx.circular_layout(H)
+
+    # Figure size
+    plt.figure(figsize=(15, 15))
+
+    # draws nodes
+    nx.draw_networkx_nodes(
+        H,
+        positions,
+        node_color="#DA70D6",
+        # nodelist=nodelist,
+        # fixed node size for now; the commented-out line below would size each node by its degree
+        node_size=5000,
+        # node_size=tuple([x**3 for x in node_sizes]),
+        alpha=0.8,
+    )
+
+    # Styling for labels
+    nx.draw_networkx_labels(H, positions, font_size=21, font_family="sans-serif")
+
+    # edge colors based on weight direction
+    if corr_direction == "positive":
+        edge_colour = plt.cm.GnBu
+    else:
+        edge_colour = plt.cm.PuRd
+
+    # draws the edges
+    nx.draw_networkx_edges(
+        H,
+        positions,
+        edgelist=edges,
+        style="solid",
+        # adds width=weights and edge_color = weights
+        # so that edges are based on the weight parameter
+        # edge_cmap is for the color scale based on the weight
+        # edge_vmin and edge_vmax set the min and max weights for the edge_cmap colour scale
+        width=weights,
+        edge_color=weights,
+        edge_cmap=edge_colour,
+        edge_vmin=min(weights),
+        edge_vmax=max(weights),
+    )
+
+    plt.axis("off")
+
+    if "save" in save_or_display:
+        plt.savefig(f"{save_location}/{file_name}_{corr_direction}.png", format="PNG")
+    if "display" in save_or_display:
+        plt.show()
+
+
+def create_animation(
+    path: str = "structured_data/plots", file_suffix: str = "positive"
+):
+    import os
+
+    import matplotlib.pyplot as plt
+    from matplotlib.animation import FuncAnimation
+
+    files = os.listdir(path)
+
+    nframes = len(files)
+    plt.subplots_adjust(top=1, bottom=0, left=0, right=1)
+
+    def animate(i: int):
+        im = plt.imread(f"{path}/{str(i)}_{file_suffix}.png")
+        plt.imshow(im)
+
+    anim = FuncAnimation(
+        plt.gcf(), animate, frames=nframes, interval=(2000.0 / nframes)
+    )
+    anim.save(f"{path}/output.gif", writer="imagemagick")
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		::: krisi.evaluate.dataset
		::: krisi.analyse.dataset