Merge pull request #24 from HelmholtzAI-Consultants-Munich/23-decision-path-heatmap

23 decision path heatmap
HelmholtzAI-Consultants-Munich · Nov 19, 2024 · 9153ca2 · 9153ca2
2 parents 7b7b0cb + df5b24d
commit 9153ca2
Show file tree

Hide file tree

Showing 9 changed files with 27,725 additions and 1,106 deletions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -6,5 +6,5 @@ sphinxcontrib-youtube==1.2.0
 nbsphinx==0.8.12
 nbsphinx-link==1.3.0
 pandoc==2.3
-jinja2==3.1.2
+jinja2
 ipykernel
diff --git a/fgclustering/forest_guided_clustering.py b/fgclustering/forest_guided_clustering.py
@@ -58,7 +58,7 @@ def __init__(
             self.model_type = "regression"
             print("Interpreting RandomForestRegressor")
         elif "RandomForestClassifier" in str(type(model)):
-            self.model_type = "classifier"
+            self.model_type = "classification"
             print("Interpreting RandomForestClassifier")
         else:
             raise ValueError(
@@ -82,7 +82,7 @@ def __init__(
 
     def run(
         self,
-        number_of_clusters: int = None,
+        k: int = None,
         max_K: int = 8,
         method_clustering: str = "pam",
         init_clustering: str = "random",
@@ -155,7 +155,7 @@ def run(
                         and scores for each number of clusters. If set to 0, no output is printed. Defaults to 1.
         :type verbose: {0, 1}, optional
         """
-        if number_of_clusters is None:
+        if k is None:
             self.k = optimizer.optimizeK(
                 distance_matrix=self.distance_matrix,
                 y=self.y.to_numpy(),
@@ -178,7 +178,7 @@ def run(
             print(f"Optimal number of cluster is: {self.k}")
 
         else:
-            self.k = number_of_clusters
+            self.k = k
             print(f"Use {self.k} as number of cluster")
 
         self.cluster_labels = (
@@ -284,6 +284,7 @@ def plot_decision_paths(
         self,
         distributions: bool = True,
         heatmap: bool = True,
+        heatmap_type: str = "static",
         thr_pvalue: float = 1,
         top_n: int = None,
         num_cols: int = 6,
@@ -331,6 +332,19 @@ def plot_decision_paths(
             )
 
         if heatmap:
-            plotting._plot_heatmap(
-                self.data_clustering_ranked[selected_features], thr_pvalue, top_n, self.model_type, save
-            )
+            if self.model_type == "regression":
+                plotting._plot_heatmap_regression(
+                    self.data_clustering_ranked[selected_features],
+                    thr_pvalue,
+                    top_n,
+                    heatmap_type,
+                    save,
+                )
+            elif self.model_type == "classification":
+                plotting._plot_heatmap_classification(
+                    self.data_clustering_ranked[selected_features],
+                    thr_pvalue,
+                    top_n,
+                    heatmap_type,
+                    save,
+                )
diff --git a/fgclustering/optimizer.py b/fgclustering/optimizer.py
@@ -20,9 +20,7 @@
 ############################################
 
 
-def _compute_jaccard_matrix(
-    clusters, indices_bootstrap_clusters, indices_original_clusters
-):
+def _compute_jaccard_matrix(clusters, indices_bootstrap_clusters, indices_original_clusters):
     """Compute Jaccard Index between all possible cluster combinations of original vs bootstrapped clustering.
 
     :param clusters: Clustering labels.
@@ -101,16 +99,13 @@ def _bootstrap_matrix(M):
     )  # Sort samples to increase speed. Does not affect downstream analysis because M is symmetric
     M_bootstrapped = _get_bootstrap(M, bootstrapped_samples)
     mapping_bootstrapped_indices_to_original_indices = {
-        bootstrapped: original
-        for bootstrapped, original in enumerate(bootstrapped_samples)
+        bootstrapped: original for bootstrapped, original in enumerate(bootstrapped_samples)
     }
 
     return M_bootstrapped, mapping_bootstrapped_indices_to_original_indices
 
 
-def _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(
-    labels, mapping=False
-):
+def _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(labels, mapping=False):
     """Create dictionary that maps indices to cluster labels.
 
     :param labels: Clustering labels.
@@ -136,9 +131,7 @@ def _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(
     return indices_clusters
 
 
-def _compute_stability_indices(
-    distance_matrix, cluster_method, clusters, indices_original_clusters
-):
+def _compute_stability_indices(distance_matrix, cluster_method, clusters, indices_original_clusters):
     """Function that parallelizes the bootstrapping loop in the _compute_stability_indices function.
     Compute stability of each cluster via Jaccard Index of original clustering vs clustering of one bootstraped sample.
 
@@ -162,15 +155,11 @@ def _compute_stability_indices(
     bootstrapped_labels = cluster_method(bootstrapped_distance_matrix)
 
     # now compute the indices for the different clusters
-    indices_bootstrap_clusters = (
-        _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(
-            bootstrapped_labels,
-            mapping=mapping_bootstrapped_indices_to_original_indices,
-        )
-    )
-    jaccard_matrix = _compute_jaccard_matrix(
-        clusters, indices_bootstrap_clusters, indices_original_clusters
+    indices_bootstrap_clusters = _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(
+        bootstrapped_labels,
+        mapping=mapping_bootstrapped_indices_to_original_indices,
     )
+    jaccard_matrix = _compute_jaccard_matrix(clusters, indices_bootstrap_clusters, indices_original_clusters)
 
     # compute optimal jaccard index for each cluster -> choose maximum possible jaccard index first
     for cluster_round in range(len(jaccard_matrix)):
@@ -186,9 +175,7 @@ def _compute_stability_indices(
     return index_per_cluster
 
 
-def _compute_stability_indices_parallel(
-    distance_matrix, labels, cluster_method, bootstraps, n_jobs
-):
+def _compute_stability_indices_parallel(distance_matrix, labels, cluster_method, bootstraps, n_jobs):
     """Compute stability of each cluster via Jaccard Index of bootstraped vs original clustering.
 
     :param distance_matrix: Proximity matrix of Random Forest model.
@@ -205,9 +192,7 @@ def _compute_stability_indices_parallel(
     :rtype: dict
     """
     clusters = np.unique(labels)
-    indices_original_clusters = (
-        _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(labels)
-    )
+    indices_original_clusters = _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(labels)
 
     # Compute Jaccard Index per bootstrapped sample
     index_per_cluster = Parallel(n_jobs=n_jobs)(
@@ -217,13 +202,9 @@ def _compute_stability_indices_parallel(
         for i in range(bootstraps)
     )
     # Sum Jaccard values of the same keys across dictionaries
-    index_per_cluster = dict(
-        functools.reduce(operator.add, map(collections.Counter, index_per_cluster))
-    )
+    index_per_cluster = dict(functools.reduce(operator.add, map(collections.Counter, index_per_cluster)))
     # normalize:
-    index_per_cluster = {
-        cluster: index_per_cluster[cluster] / bootstraps for cluster in clusters
-    }
+    index_per_cluster = {cluster: index_per_cluster[cluster] / bootstraps for cluster in clusters}
 
     return index_per_cluster
 
@@ -304,17 +285,13 @@ def optimizeK(
         index_per_cluster = _compute_stability_indices_parallel(
             distance_matrix, labels, cluster_method, bootstraps_JI, n_jobs
         )
-        min_index = min(
-            [index_per_cluster[cluster] for cluster in index_per_cluster.keys()]
-        )
+        min_index = min([index_per_cluster[cluster] for cluster in index_per_cluster.keys()])
 
-        # only continue if jaccard indices are all larger than 0.6 (thus all clusters are stable)
+        # only continue if jaccard indices are all larger than discart_value_JI (thus all clusters are stable)
         if not disable:
-            print(
-                "For number of cluster {} the Jaccard Index is {}".format(k, min_index)
-            )
+            print("For number of cluster {} the Jaccard Index is {}".format(k, min_index))
         if min_index > discart_value_JI:
-            if model_type == "classifier":
+            if model_type == "classification":
                 # compute balanced purities
                 score = statistics.compute_balanced_average_impurity(y, labels)
             elif model_type == "regression":