Skip to content

Commit

Permalink
Merge pull request #24 from HelmholtzAI-Consultants-Munich/23-decisio…
Browse files Browse the repository at this point in the history
…n-path-heatmap

23 decision path heatmap
  • Loading branch information
lisa-sousa authored Nov 19, 2024
2 parents 7b7b0cb + df5b24d commit 9153ca2
Show file tree
Hide file tree
Showing 9 changed files with 27,725 additions and 1,106 deletions.
2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ sphinxcontrib-youtube==1.2.0
nbsphinx==0.8.12
nbsphinx-link==1.3.0
pandoc==2.3
jinja2==3.1.2
jinja2
ipykernel
28 changes: 21 additions & 7 deletions fgclustering/forest_guided_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def __init__(
self.model_type = "regression"
print("Interpreting RandomForestRegressor")
elif "RandomForestClassifier" in str(type(model)):
self.model_type = "classifier"
self.model_type = "classification"
print("Interpreting RandomForestClassifier")
else:
raise ValueError(
Expand All @@ -82,7 +82,7 @@ def __init__(

def run(
self,
number_of_clusters: int = None,
k: int = None,
max_K: int = 8,
method_clustering: str = "pam",
init_clustering: str = "random",
Expand Down Expand Up @@ -155,7 +155,7 @@ def run(
and scores for each number of clusters. If set to 0, no output is printed. Defaults to 1.
:type verbose: {0, 1}, optional
"""
if number_of_clusters is None:
if k is None:
self.k = optimizer.optimizeK(
distance_matrix=self.distance_matrix,
y=self.y.to_numpy(),
Expand All @@ -178,7 +178,7 @@ def run(
print(f"Optimal number of cluster is: {self.k}")

else:
self.k = number_of_clusters
self.k = k
print(f"Use {self.k} as number of cluster")

self.cluster_labels = (
Expand Down Expand Up @@ -284,6 +284,7 @@ def plot_decision_paths(
self,
distributions: bool = True,
heatmap: bool = True,
heatmap_type: str = "static",
thr_pvalue: float = 1,
top_n: int = None,
num_cols: int = 6,
Expand Down Expand Up @@ -331,6 +332,19 @@ def plot_decision_paths(
)

if heatmap:
plotting._plot_heatmap(
self.data_clustering_ranked[selected_features], thr_pvalue, top_n, self.model_type, save
)
if self.model_type == "regression":
plotting._plot_heatmap_regression(
self.data_clustering_ranked[selected_features],
thr_pvalue,
top_n,
heatmap_type,
save,
)
elif self.model_type == "classification":
plotting._plot_heatmap_classification(
self.data_clustering_ranked[selected_features],
thr_pvalue,
top_n,
heatmap_type,
save,
)
55 changes: 16 additions & 39 deletions fgclustering/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@
############################################


def _compute_jaccard_matrix(
clusters, indices_bootstrap_clusters, indices_original_clusters
):
def _compute_jaccard_matrix(clusters, indices_bootstrap_clusters, indices_original_clusters):
"""Compute Jaccard Index between all possible cluster combinations of original vs bootstrapped clustering.
:param clusters: Clustering labels.
Expand Down Expand Up @@ -101,16 +99,13 @@ def _bootstrap_matrix(M):
) # Sort samples to increase speed. Does not affect downstream analysis because M is symmetric
M_bootstrapped = _get_bootstrap(M, bootstrapped_samples)
mapping_bootstrapped_indices_to_original_indices = {
bootstrapped: original
for bootstrapped, original in enumerate(bootstrapped_samples)
bootstrapped: original for bootstrapped, original in enumerate(bootstrapped_samples)
}

return M_bootstrapped, mapping_bootstrapped_indices_to_original_indices


def _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(
labels, mapping=False
):
def _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(labels, mapping=False):
"""Create dictionary that maps indices to cluster labels.
:param labels: Clustering labels.
Expand All @@ -136,9 +131,7 @@ def _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(
return indices_clusters


def _compute_stability_indices(
distance_matrix, cluster_method, clusters, indices_original_clusters
):
def _compute_stability_indices(distance_matrix, cluster_method, clusters, indices_original_clusters):
"""Function that parallelizes the bootstrapping loop in the _compute_stability_indices function.
Compute stability of each cluster via Jaccard Index of original clustering vs clustering of one bootstrapped sample.
Expand All @@ -162,15 +155,11 @@ def _compute_stability_indices(
bootstrapped_labels = cluster_method(bootstrapped_distance_matrix)

# now compute the indices for the different clusters
indices_bootstrap_clusters = (
_translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(
bootstrapped_labels,
mapping=mapping_bootstrapped_indices_to_original_indices,
)
)
jaccard_matrix = _compute_jaccard_matrix(
clusters, indices_bootstrap_clusters, indices_original_clusters
indices_bootstrap_clusters = _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(
bootstrapped_labels,
mapping=mapping_bootstrapped_indices_to_original_indices,
)
jaccard_matrix = _compute_jaccard_matrix(clusters, indices_bootstrap_clusters, indices_original_clusters)

# compute optimal jaccard index for each cluster -> choose maximum possible jaccard index first
for cluster_round in range(len(jaccard_matrix)):
Expand All @@ -186,9 +175,7 @@ def _compute_stability_indices(
return index_per_cluster


def _compute_stability_indices_parallel(
distance_matrix, labels, cluster_method, bootstraps, n_jobs
):
def _compute_stability_indices_parallel(distance_matrix, labels, cluster_method, bootstraps, n_jobs):
"""Compute stability of each cluster via Jaccard Index of bootstrapped vs original clustering.
:param distance_matrix: Proximity matrix of Random Forest model.
Expand All @@ -205,9 +192,7 @@ def _compute_stability_indices_parallel(
:rtype: dict
"""
clusters = np.unique(labels)
indices_original_clusters = (
_translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(labels)
)
indices_original_clusters = _translate_cluster_labels_to_dictionary_of_index_sets_per_cluster(labels)

# Compute Jaccard Index per bootstrapped sample
index_per_cluster = Parallel(n_jobs=n_jobs)(
Expand All @@ -217,13 +202,9 @@ def _compute_stability_indices_parallel(
for i in range(bootstraps)
)
# Sum Jaccard values of the same keys across dictionaries
index_per_cluster = dict(
functools.reduce(operator.add, map(collections.Counter, index_per_cluster))
)
index_per_cluster = dict(functools.reduce(operator.add, map(collections.Counter, index_per_cluster)))
# normalize:
index_per_cluster = {
cluster: index_per_cluster[cluster] / bootstraps for cluster in clusters
}
index_per_cluster = {cluster: index_per_cluster[cluster] / bootstraps for cluster in clusters}

return index_per_cluster

Expand Down Expand Up @@ -304,17 +285,13 @@ def optimizeK(
index_per_cluster = _compute_stability_indices_parallel(
distance_matrix, labels, cluster_method, bootstraps_JI, n_jobs
)
min_index = min(
[index_per_cluster[cluster] for cluster in index_per_cluster.keys()]
)
min_index = min([index_per_cluster[cluster] for cluster in index_per_cluster.keys()])

# only continue if jaccard indices are all larger than 0.6 (thus all clusters are stable)
# only continue if jaccard indices are all larger than discart_value_JI (thus all clusters are stable)
if not disable:
print(
"For number of cluster {} the Jaccard Index is {}".format(k, min_index)
)
print("For number of cluster {} the Jaccard Index is {}".format(k, min_index))
if min_index > discart_value_JI:
if model_type == "classifier":
if model_type == "classification":
# compute balanced purities
score = statistics.compute_balanced_average_impurity(y, labels)
elif model_type == "regression":
Expand Down
Loading

0 comments on commit 9153ca2

Please sign in to comment.