Refactor AAclust backend. Merge .Plot into . (e.g., CPPPlot into CPP).

breimanntools · Oct 7, 2023 · f38cd9b · f38cd9b
1 parent 4615df3
commit f38cd9b
Show file tree

Hide file tree

Showing 59 changed files with 132 additions and 132 deletions.
diff --git a/aaanalysis/__pycache__/utils.cpython-39.pyc b/aaanalysis/__pycache__/utils.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/__pycache__/_aaclust.cpython-39.pyc b/aaanalysis/feature_engineering/__pycache__/_aaclust.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/__pycache__/_aaclust_plot.cpython-39.pyc b/aaanalysis/feature_engineering/__pycache__/_aaclust_plot.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/__pycache__/_cpp.cpython-39.pyc b/aaanalysis/feature_engineering/__pycache__/_cpp.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/__pycache__/_cpp_plot.cpython-39.pyc b/aaanalysis/feature_engineering/__pycache__/_cpp_plot.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/_aaclust.py b/aaanalysis/feature_engineering/_aaclust.py
@@ -1,10 +1,9 @@
 """
-This is a script for the AAclust clustering wrapper method.
+This is a script for the interface of the AAclust class, a clustering wrapper method.
 """
 import numpy as np
 from typing import Optional, Dict, Union, List, Tuple, Type
 from sklearn.cluster import KMeans
-from sklearn.metrics import silhouette_score, calinski_harabasz_score
 from sklearn.base import ClusterMixin
 from sklearn.exceptions import ConvergenceWarning
 import warnings
@@ -15,8 +14,9 @@
 import aaanalysis.utils as ut
 
 from ._backend.aaclust.aaclust_fit import estimate_lower_bound_n_clusters, optimize_n_clusters, merge_clusters
-from ._backend.aaclust.aaclust_eval import bic_score
-from ._backend.aaclust.aaclust_methods import (compute_centers, compute_medoids,
+from ._backend.aaclust.aaclust_eval import evaluate_clustering
+from ._backend.aaclust.aaclust_methods import (compute_centers,
+                                               compute_medoids,
                                                name_clusters,
                                                compute_correlation)
 
@@ -132,7 +132,7 @@ class AAclust(Wrapper):
     def __init__(self,
                  model_class: Type[ClusterMixin] = KMeans,
                  model_kwargs: Optional[Dict] = None,
-                 verbose: bool = False):
+                 verbose: Optional[bool] = None):
         # Model parameters
         model_class = ut.check_mode_class(model_class=model_class)
         if model_kwargs is None and model_class is KMeans:
@@ -310,7 +310,8 @@ def eval(self,
 
         Notes
         -----
-        BIC was modified to align with the SC and CH, so that higher values signify better clustering
+        BIC was adapted from this `StackExchange discussion <https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans>`_
+        and modified to align with the SC and CH score so that higher values signify better clustering,
         contrary to conventional BIC implementation favoring lower values. See [Breimann23a]_.
 
         See Also
@@ -326,21 +327,18 @@ def eval(self,
         labels = ut.check_labels(labels=labels)
         ut.check_match_X_labels(X=X, labels=labels)
 
-        # Number of clusters (number of medoids)
+        # Get number of clusters (number of medoids) and evaluation measures
         n_clusters = len(set(labels))
-        # Bayesian Information Criterion
-        BIC = bic_score(X, labels)
-        # Calinski-Harabasz Index
-        CH = calinski_harabasz_score(X, labels)
-        if np.isnan(CH):
-            CH = 0
-            warnings.warn("CH was set to 0 because sklearn.metric.calinski_harabasz_score returned NaN.", RuntimeWarning)
-        # Silhouette Coefficient
-        SC = silhouette_score(X, labels)
-        if np.isnan(SC):
-            SC = -1
-            warnings.warn("SC was set to -1 because sklearn.metric.silhouette_score returned NaN.", RuntimeWarning)
-        return n_clusters, BIC, CH, SC
+        bic, ch, sc = evaluate_clustering(X, labels=labels)
+        if np.isnan(ch):
+            ch = 0
+            if self._verbose:
+                warnings.warn("CH was set to 0 because sklearn.metric.calinski_harabasz_score returned NaN.", RuntimeWarning)
+        if np.isnan(sc):
+            sc = -1
+            if self._verbose:
+                warnings.warn("SC was set to -1 because sklearn.metric.silhouette_score returned NaN.", RuntimeWarning)
+        return n_clusters, bic, ch, sc
 
     @staticmethod
     def name_clusters(X: ut.ArrayLike2D,

diff --git a/aaanalysis/feature_engineering/_aaclust_plot.py b/aaanalysis/feature_engineering/_aaclust_plot.py
@@ -1,5 +1,5 @@
 """
-This is a script for the plotting class of AAclust.
+This is a script for the interface of the AAclustPlot class, used for plotting the results of AAclust.
 """
 from sklearn.decomposition import PCA
 from typing import Optional, Dict, Union, List, Tuple, Type
@@ -8,7 +8,7 @@
 import aaanalysis as aa
 import aaanalysis.utils as ut
 
-from ._backend.aaclust_plot.aaclust_plot_eval import plot_eval
+from ._backend.aaclust.aaclust_plot import plot_eval
 
 
 # I Helper Functions

diff --git a/aaanalysis/feature_engineering/_backend/aaclust/__pycache__/_utils_aaclust.cpython-39.pyc b/aaanalysis/feature_engineering/_backend/aaclust/__pycache__/_utils_aaclust.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/_backend/aaclust/__pycache__/aaclust_eval.cpython-39.pyc b/aaanalysis/feature_engineering/_backend/aaclust/__pycache__/aaclust_eval.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/_backend/aaclust/__pycache__/aaclust_fit.cpython-39.pyc b/aaanalysis/feature_engineering/_backend/aaclust/__pycache__/aaclust_fit.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/_backend/aaclust/__pycache__/aaclust_methods.cpython-39.pyc b/aaanalysis/feature_engineering/_backend/aaclust/__pycache__/aaclust_methods.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/_backend/aaclust/__pycache__/aaclust_plot.cpython-39.pyc b/aaanalysis/feature_engineering/_backend/aaclust/__pycache__/aaclust_plot.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/_backend/aaclust/_utils_aaclust.py b/aaanalysis/feature_engineering/_backend/aaclust/_utils_aaclust.py
@@ -1,7 +1,8 @@
 """
-This is a script for utility functions for aaclust object.
+This is a script for utility functions for the AAclust object and backend.
 """
 import numpy as np
+from collections import OrderedDict
 
 
 # II Main Functions
@@ -19,5 +20,13 @@ def _cluster_medoid(X):
     return ind_max
 
 
+def _compute_centers(X, labels=None):
+    """Obtain cluster centers and their labels"""
+    center_labels = list(OrderedDict.fromkeys(labels))
+    list_masks = [[True if i == label else False for i in labels] for label in center_labels]
+    centers = np.concatenate([_cluster_center(X[mask]) for mask in list_masks]).round(3)
+    return centers, np.array(center_labels)
+
+
 
 
diff --git a/aaanalysis/feature_engineering/_backend/aaclust/aaclust_eval.py b/aaanalysis/feature_engineering/_backend/aaclust/aaclust_eval.py
@@ -1,27 +1,18 @@
 """
-This is a script for computing the Bayesian Information Criterion (BIC) used in the AAclust.eval() method.
+This is a script for the backend of the AAclust.eval method.
 """
 import numpy as np
 from scipy.spatial import distance
+from sklearn.metrics import silhouette_score, calinski_harabasz_score
 
-from .aaclust_methods import compute_centers
+from ._utils_aaclust import _compute_centers
 
 # I Helper Functions
 
 
 # II Main function
 def bic_score(X, labels=None):
-    """Computes the BIC metric for given clusters.
-
-    Returns
-    -------
-    BIC : float
-        BIC value between -inf and inf. Greater values indicate better clustering.
-
-    See also
-    --------
-    https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans
-    """
+    """Computes the Bayesian Information Criterion (BIC) metric for given clusters."""
     epsilon = 1e-10 # prevent division by zero
 
     # Check if labels match to number of clusters
@@ -35,7 +26,7 @@ def bic_score(X, labels=None):
     # Map labels to increasing order starting with 0
     unique_labels, inverse = np.unique(labels, return_inverse=True)
     labels = inverse
-    centers, center_labels = compute_centers(X, labels=labels)
+    centers, center_labels = _compute_centers(X, labels=labels)
     size_clusters = np.bincount(labels)
 
     # Compute variance over all clusters
@@ -55,4 +46,15 @@ def bic_score(X, labels=None):
 
     bic_components = size_clusters * (log_size_clusters - log_n_samples) - 0.5 * size_clusters * n_features * log_bcv - 0.5 * (size_clusters - 1) * n_features
     bic = np.sum(bic_components) - const_term
-    return bic
+    return bic
+
+
+def evaluate_clustering(X, labels=None):
+    """Evaluate clustering results using BIC, CH, SC scores"""
+    # Bayesian Information Criterion
+    bic = bic_score(X, labels)
+    # Calinski-Harabasz Index
+    ch = calinski_harabasz_score(X, labels)
+    # Silhouette Coefficient
+    sc = silhouette_score(X, labels)
+    return bic, ch, sc
diff --git a/aaanalysis/feature_engineering/_backend/aaclust/aaclust_fit.py b/aaanalysis/feature_engineering/_backend/aaclust/aaclust_fit.py
@@ -1,7 +1,8 @@
 """
-This is a script for three main steps of the AAclust algorithm and further helper functions.
+This is a script for the backend of the AAclust.fit() method.
+
+The fit function performs the AAclust algorithm, which consists of three steps:
 
-AAclust algorithm steps
 1. Estimate lower bound for n_clusters
 2. Optimization of n_clusters
 3. Merge clusters
@@ -17,7 +18,7 @@
 
 # I Helper Functions
 
-
+# TODO simplify documentation (DRY)
 # Compute minimum correlation on center or all scales
 def min_cor_center(X):
     """Get minimum for correlation of all columns with cluster center, defined as the mean values

diff --git a/aaanalysis/feature_engineering/_backend/aaclust/aaclust_methods.py b/aaanalysis/feature_engineering/_backend/aaclust/aaclust_methods.py
@@ -1,25 +1,23 @@
 """
-This is a script for the AAclust methods.
+This is a script for the backend of various AAclust methods.
 """
 import pandas as pd
 import numpy as np
 from collections import OrderedDict
 
 import aaanalysis.utils as ut
-from ._utils_aaclust import _cluster_center, _cluster_medoid
+from ._utils_aaclust import _cluster_medoid, _compute_centers
 
-# Settings
 
 # I Helper function
 
 
 # II Main Functions
 def compute_centers(X, labels=None):
     """Obtain cluster centers and their labels"""
-    center_labels = list(OrderedDict.fromkeys(labels))
-    list_masks = [[True if i == label else False for i in labels] for label in center_labels]
-    centers = np.concatenate([_cluster_center(X[mask]) for mask in list_masks]).round(3)
-    return centers, np.array(center_labels)
+    # Function lives in utils to avoid breaking dependency rules:
+    # Backend functions should only depend on backend utility functions
+    return _compute_centers(X, labels=labels)
 
 
 def compute_medoids(X, labels=None):

diff --git a/...kend/aaclust_plot/aaclust_plot_methods.py → ...ineering/_backend/aaclust/aaclust_plot.py b/...kend/aaclust_plot/aaclust_plot_methods.py → ...ineering/_backend/aaclust/aaclust_plot.py
@@ -1,24 +1,51 @@
 """
-This is a script for ...
+This is a script for the AAclust plot_eval method.
 """
-import warnings
-
+import time
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-import math
-import matplotlib as mpl
-
-import aaanalysis._utils as ut
+import numpy as np
 
-# Settings
-pd.set_option('expand_frame_repr', False)  # Single line print for pd.Dataframe
+import aaanalysis.utils as ut
 
 
 # I Helper Functions
-
+def _get_rank(data):
+    """"""
+    _df = data.copy()
+    _df['BIC_rank'] = _df['BIC'].rank(ascending=False)
+    _df['CH_rank'] = _df['CH'].rank(ascending=False)
+    _df['SC_rank'] = _df['SC'].rank(ascending=False)
+    return _df[['BIC_rank', 'CH_rank', 'SC_rank']].mean(axis=1).round(2)
 
 # II Main Functions
+def plot_eval(data=None, names=None, dict_xlims=None, figsize=None, columns=None, colors=None):
+    """"""
+    data = pd.DataFrame(data, columns=columns, index=names)
+    data["rank"] = _get_rank(data)
+    data = data.sort_values(by="rank", ascending=True)
+    # Plotting
+    fig, axes = plt.subplots(1, 4, sharey=True, figsize=figsize)
+    for i, col in enumerate(columns):
+        ax = axes[i]
+        sns.barplot(ax=ax, data=data, y=data.index, x=col, color=colors[i])
+        # Customize subplots
+        ax.set_ylabel("")
+        ax.set_xlabel(col)
+        ax.axvline(0, color='black')  # , linewidth=aa.plot_gcfs("axes.linewidth"))
+        if dict_xlims and col in dict_xlims:
+            ax.set_xlim(dict_xlims[col])
+        if i == 0:
+            ax.set_title("Number of clusters", weight="bold")
+        elif i == 2:
+            ax.set_title("Quality measures", weight="bold")
+        sns.despine(ax=ax, left=True)
+        ax.tick_params(axis='y', which='both', left=False)
+    plt.tight_layout()
+    plt.subplots_adjust(wspace=0.25, hspace=0)
+    return fig, axes
+
 def _plot_pca(df_pred=None, filter_classes=None, x=None, y=None,  others=True, highlight_rel=True,
               figsize=(6, 6), highlight_mean=True, list_classes=None):
     """"""

diff --git a/aaanalysis/feature_engineering/_backend/aaclust_plot/__init__.py b/aaanalysis/feature_engineering/_backend/aaclust_plot/__init__.py
diff --git a/aaanalysis/feature_engineering/_backend/aaclust_plot/__pycache__/__init__.cpython-39.pyc b/aaanalysis/feature_engineering/_backend/aaclust_plot/__pycache__/__init__.cpython-39.pyc
diff --git a/...is/feature_engineering/_backend/aaclust_plot/__pycache__/aaclust_plot_eval.cpython-39.pyc b/...is/feature_engineering/_backend/aaclust_plot/__pycache__/aaclust_plot_eval.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/_backend/aaclust_plot/aaclust_plot_eval.py b/aaanalysis/feature_engineering/_backend/aaclust_plot/aaclust_plot_eval.py
diff --git a/..._pycache__/cpp_plot_method.cpython-39.pyc → ...d/cpp/__pycache__/cpp_plot.cpython-39.pyc b/..._pycache__/cpp_plot_method.cpython-39.pyc → ...d/cpp/__pycache__/cpp_plot.cpython-39.pyc
diff --git a/...ring/_backend/cpp_plot/cpp_plot_method.py → ...ture_engineering/_backend/cpp/cpp_plot.py b/...ring/_backend/cpp_plot/cpp_plot_method.py → ...ture_engineering/_backend/cpp/cpp_plot.py
diff --git a/aaanalysis/feature_engineering/_backend/cpp_plot/__init__.py b/aaanalysis/feature_engineering/_backend/cpp_plot/__init__.py
diff --git a/aaanalysis/feature_engineering/_backend/cpp_plot/__pycache__/__init__.cpython-39.pyc b/aaanalysis/feature_engineering/_backend/cpp_plot/__pycache__/__init__.cpython-39.pyc
diff --git a/aaanalysis/feature_engineering/_cpp.py b/aaanalysis/feature_engineering/_cpp.py
@@ -54,8 +54,13 @@ class CPP(Tool):
     The CPP.run() method performs all steps of the CPP algorithm.
 
     """
-    def __init__(self, df_scales=None, df_cat=None, df_parts=None, split_kws=None,
-                 accept_gaps=False, verbose=True):
+    def __init__(self,
+                 df_scales=None,
+                 df_cat=None,
+                 df_parts=None,
+                 split_kws=None,
+                 accept_gaps=False,
+                 verbose=None):
         # Load default scales if not specified
         sf = SequenceFeature()
         if df_cat is None:

diff --git a/aaanalysis/feature_engineering/_cpp_plot.py b/aaanalysis/feature_engineering/_cpp_plot.py
@@ -9,7 +9,7 @@
 import aaanalysis as aa
 import aaanalysis.utils as ut
 
-from ._backend.cpp_plot.cpp_plot_method import CPPPlots, get_optimal_fontsize
+from ._backend.cpp.cpp_plot import CPPPlots, get_optimal_fontsize
 
 
 # Settings

diff --git a/aaanalysis/pu_learning/_backend/dpulearn_plot/__init__.py b/aaanalysis/pu_learning/_backend/dpulearn_plot/__init__.py
diff --git a/aaanalysis/utils.py b/aaanalysis/utils.py
@@ -164,14 +164,13 @@ def read_csv_cached(name, sep=None):
 
 # Main check functions
 def check_verbose(verbose):
-    check_bool(name="verbose", val=verbose)
-    # System level verbosity
-    verbose_value = options['verbose']
-    # If system level is negative return
-    if not verbose_value:
-        return verbose_value
+    if verbose is None:
+        # System level verbosity
+        verbose = options['verbose']
     else:
-        return verbose
+        check_bool(name="verbose", val=verbose)
+    return verbose
+
 
 # TODO check each of this checking function (make simpler)
 # Check key dataframes using constants and general checking functions

diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle
diff --git a/docs/build/doctrees/generated/aaanalysis.AAclust.doctree b/docs/build/doctrees/generated/aaanalysis.AAclust.doctree
diff --git a/docs/build/doctrees/generated/aaanalysis.CPP.doctree b/docs/build/doctrees/generated/aaanalysis.CPP.doctree
diff --git a/docs/build/html/_downloads/004048c0cbb6684bdb9047282ab71735/aaanalysis-plot_settings-2.pdf b/docs/build/html/_downloads/004048c0cbb6684bdb9047282ab71735/aaanalysis-plot_settings-2.pdf
diff --git a/docs/build/html/_downloads/163aacac4bd235c9af7a62d7b4d0c89f/aaanalysis-plot_get_cdict-1.pdf b/docs/build/html/_downloads/163aacac4bd235c9af7a62d7b4d0c89f/aaanalysis-plot_get_cdict-1.pdf
diff --git a/docs/build/html/_downloads/1f3abea1675a65bb341756c52c9927f4/aaanalysis-plot_gcfs-1.pdf b/docs/build/html/_downloads/1f3abea1675a65bb341756c52c9927f4/aaanalysis-plot_gcfs-1.pdf
diff --git a/docs/build/html/_downloads/72c2e4be500ecf10c85a4e6f81c365fc/aaanalysis-plot_legend-1.pdf b/docs/build/html/_downloads/72c2e4be500ecf10c85a4e6f81c365fc/aaanalysis-plot_legend-1.pdf
diff --git a/docs/build/html/_downloads/795a736e5af756908120c8bda412fd28/aaanalysis-plot_get_cmap-1.pdf b/docs/build/html/_downloads/795a736e5af756908120c8bda412fd28/aaanalysis-plot_get_cmap-1.pdf
diff --git a/docs/build/html/_downloads/88d7f3f7cb5a284c0bfaa377fb4ce1d8/aaanalysis-plot_get_clist-1.pdf b/docs/build/html/_downloads/88d7f3f7cb5a284c0bfaa377fb4ce1d8/aaanalysis-plot_get_clist-1.pdf
diff --git a/docs/build/html/_downloads/fb29bffb69140db4f68c4eb913c6f7d3/aaanalysis-plot_settings-1.pdf b/docs/build/html/_downloads/fb29bffb69140db4f68c4eb913c6f7d3/aaanalysis-plot_settings-1.pdf