Skip to content

Commit

Permalink
Refactor AAclust backend. Merge .Plot into . (e.g., CPPPlot into CPP).
Browse files Browse the repository at this point in the history
  • Loading branch information
breimanntools committed Oct 7, 2023
1 parent 4615df3 commit f38cd9b
Show file tree
Hide file tree
Showing 59 changed files with 132 additions and 132 deletions.
Binary file modified aaanalysis/__pycache__/utils.cpython-39.pyc
Binary file not shown.
Binary file modified aaanalysis/feature_engineering/__pycache__/_aaclust.cpython-39.pyc
Binary file not shown.
Binary file not shown.
Binary file modified aaanalysis/feature_engineering/__pycache__/_cpp.cpython-39.pyc
Binary file not shown.
Binary file not shown.
38 changes: 18 additions & 20 deletions aaanalysis/feature_engineering/_aaclust.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
"""
This is a script for the AAclust clustering wrapper method.
This is a script for the interface of the AAclust class, a clustering wrapper method.
"""
import numpy as np
from typing import Optional, Dict, Union, List, Tuple, Type
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.base import ClusterMixin
from sklearn.exceptions import ConvergenceWarning
import warnings
Expand All @@ -15,8 +14,9 @@
import aaanalysis.utils as ut

from ._backend.aaclust.aaclust_fit import estimate_lower_bound_n_clusters, optimize_n_clusters, merge_clusters
from ._backend.aaclust.aaclust_eval import bic_score
from ._backend.aaclust.aaclust_methods import (compute_centers, compute_medoids,
from ._backend.aaclust.aaclust_eval import evaluate_clustering
from ._backend.aaclust.aaclust_methods import (compute_centers,
compute_medoids,
name_clusters,
compute_correlation)

Expand Down Expand Up @@ -132,7 +132,7 @@ class AAclust(Wrapper):
def __init__(self,
model_class: Type[ClusterMixin] = KMeans,
model_kwargs: Optional[Dict] = None,
verbose: bool = False):
verbose: Optional[bool] = None):
# Model parameters
model_class = ut.check_mode_class(model_class=model_class)
if model_kwargs is None and model_class is KMeans:
Expand Down Expand Up @@ -310,7 +310,8 @@ def eval(self,
Notes
-----
BIC was modified to align with the SC and CH, so that higher values signify better clustering
BIC was adapted from this `StackExchange discussion <https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans>`_
and modified to align with the SC and CH score so that higher values signify better clustering,
contrary to conventional BIC implementation favoring lower values. See [Breimann23a]_.
See Also
Expand All @@ -326,21 +327,18 @@ def eval(self,
labels = ut.check_labels(labels=labels)
ut.check_match_X_labels(X=X, labels=labels)

# Number of clusters (number of medoids)
# Get number of clusters (number of medoids) and evaluation measures
n_clusters = len(set(labels))
# Bayesian Information Criterion
BIC = bic_score(X, labels)
# Calinski-Harabasz Index
CH = calinski_harabasz_score(X, labels)
if np.isnan(CH):
CH = 0
warnings.warn("CH was set to 0 because sklearn.metric.calinski_harabasz_score returned NaN.", RuntimeWarning)
# Silhouette Coefficient
SC = silhouette_score(X, labels)
if np.isnan(SC):
SC = -1
warnings.warn("SC was set to -1 because sklearn.metric.silhouette_score returned NaN.", RuntimeWarning)
return n_clusters, BIC, CH, SC
bic, ch, sc = evaluate_clustering(X, labels=labels)
if np.isnan(ch):
ch = 0
if self._verbose:
warnings.warn("CH was set to 0 because sklearn.metric.calinski_harabasz_score returned NaN.", RuntimeWarning)
if np.isnan(sc):
sc = -1
if self._verbose:
warnings.warn("SC was set to -1 because sklearn.metric.silhouette_score returned NaN.", RuntimeWarning)
return n_clusters, bic, ch, sc

@staticmethod
def name_clusters(X: ut.ArrayLike2D,
Expand Down
4 changes: 2 additions & 2 deletions aaanalysis/feature_engineering/_aaclust_plot.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This is a script for the plotting class of AAclust.
This is a script for the interface of the AAclustPlot class, used for plotting the results of AAclust.
"""
from sklearn.decomposition import PCA
from typing import Optional, Dict, Union, List, Tuple, Type
Expand All @@ -8,7 +8,7 @@
import aaanalysis as aa
import aaanalysis.utils as ut

from ._backend.aaclust_plot.aaclust_plot_eval import plot_eval
from ._backend.aaclust.aaclust_plot import plot_eval


# I Helper Functions
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""
This is a script for utility functions for aaclust object.
This is a script for utility functions for the AAclust object and backend.
"""
import numpy as np
from collections import OrderedDict


# II Main Functions
Expand All @@ -19,5 +20,13 @@ def _cluster_medoid(X):
return ind_max


def _compute_centers(X, labels=None):
    """Obtain cluster centers and their labels.

    Parameters
    ----------
    X : array-like
        Data that is indexed along its first axis with one boolean mask per cluster
        (presumably of shape (n_samples, n_features) -- confirm against callers).
    labels : array-like
        Cluster label for each entry of ``X``.

    Returns
    -------
    centers : np.ndarray
        Concatenated outputs of ``_cluster_center`` for every cluster, rounded to three decimals.
    center_labels : np.ndarray
        Unique labels in order of first appearance, aligned with ``centers``.
    """
    # OrderedDict.fromkeys de-duplicates the labels while keeping first-seen order
    center_labels = list(OrderedDict.fromkeys(labels))
    # One vectorized comparison per cluster instead of a nested Python-level loop
    labels_arr = np.asarray(labels)
    centers = np.concatenate([_cluster_center(X[labels_arr == label]) for label in center_labels]).round(3)
    return centers, np.array(center_labels)




32 changes: 17 additions & 15 deletions aaanalysis/feature_engineering/_backend/aaclust/aaclust_eval.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,18 @@
"""
This is a script for computing the Bayesian Information Criterion (BIC) used in the AAclust.eval() method.
This is a script for the backend of the AAclust.eval method.
"""
import numpy as np
from scipy.spatial import distance
from sklearn.metrics import silhouette_score, calinski_harabasz_score

from .aaclust_methods import compute_centers
from ._utils_aaclust import _compute_centers

# I Helper Functions


# II Main function
def bic_score(X, labels=None):
"""Computes the BIC metric for given clusters.
Returns
-------
BIC : float
BIC value between -inf and inf. Greater values indicate better clustering.
See also
--------
https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans
"""
"""Computes the Bayesian Information Criterion (BIC) metric for given clusters."""
epsilon = 1e-10 # prevent division by zero

# Check if labels match to number of clusters
Expand All @@ -35,7 +26,7 @@ def bic_score(X, labels=None):
# Map labels to increasing order starting with 0
unique_labels, inverse = np.unique(labels, return_inverse=True)
labels = inverse
centers, center_labels = compute_centers(X, labels=labels)
centers, center_labels = _compute_centers(X, labels=labels)
size_clusters = np.bincount(labels)

# Compute variance over all clusters
Expand All @@ -55,4 +46,15 @@ def bic_score(X, labels=None):

bic_components = size_clusters * (log_size_clusters - log_n_samples) - 0.5 * size_clusters * n_features * log_bcv - 0.5 * (size_clusters - 1) * n_features
bic = np.sum(bic_components) - const_term
return bic
return bic


def evaluate_clustering(X, labels=None):
    """Score a clustering with the three evaluation measures used by AAclust.

    The scores are computed in this order: Bayesian Information Criterion
    (``bic_score``), Calinski-Harabasz Index (``calinski_harabasz_score``), and
    Silhouette Coefficient (``silhouette_score``). The values are returned exactly
    as the scorers produce them.

    Returns
    -------
    bic, ch, sc
        The three scores, in that order.
    """
    scorers = (bic_score, calinski_harabasz_score, silhouette_score)
    bic, ch, sc = [scorer(X, labels) for scorer in scorers]
    return bic, ch, sc
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""
This is a script for three main steps of the AAclust algorithm and further helper functions.
This is a script for the backend of the AAclust.fit() method.
The fit function performs the AAclust algorithm consisting of three steps:
AAclust algorithm steps
1. Estimate lower bound for n_clusters
2. Optimization of n_clusters
3. Merge clusters
Expand All @@ -17,7 +18,7 @@

# I Helper Functions


# TODO simplify documentation (DRY)
# Compute minimum correlation on center or all scales
def min_cor_center(X):
"""Get minimum for correlation of all columns with cluster center, defined as the mean values
Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +1,23 @@
"""
This is a script for the AAclust methods.
This is a script for the backend of various AAclust methods.
"""
import pandas as pd
import numpy as np
from collections import OrderedDict

import aaanalysis.utils as ut
from ._utils_aaclust import _cluster_center, _cluster_medoid
from ._utils_aaclust import _cluster_medoid, _compute_centers

# Settings

# I Helper function


# II Main Functions
def compute_centers(X, labels=None):
"""Obtain cluster centers and their labels"""
center_labels = list(OrderedDict.fromkeys(labels))
list_masks = [[True if i == label else False for i in labels] for label in center_labels]
centers = np.concatenate([_cluster_center(X[mask]) for mask in list_masks]).round(3)
return centers, np.array(center_labels)
# Function lives in utils to avoid breaking dependency rules:
# Backend functions should only depend on backend utility functions
return _compute_centers(X, labels=labels)


def compute_medoids(X, labels=None):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,24 +1,51 @@
"""
This is a script for ...
This is a script for the AAclust plot_eval method.
"""
import warnings

import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import matplotlib as mpl

import aaanalysis._utils as ut
import numpy as np

# Settings
pd.set_option('expand_frame_repr', False) # Single line print for pd.Dataframe
import aaanalysis.utils as ut


# I Helper Functions

def _get_rank(data):
""""""
_df = data.copy()
_df['BIC_rank'] = _df['BIC'].rank(ascending=False)
_df['CH_rank'] = _df['CH'].rank(ascending=False)
_df['SC_rank'] = _df['SC'].rank(ascending=False)
return _df[['BIC_rank', 'CH_rank', 'SC_rank']].mean(axis=1).round(2)

# II Main Functions
def plot_eval(data=None, names=None, dict_xlims=None, figsize=None, columns=None, colors=None):
    """Plot clustering evaluation results as a row of horizontal bar plots.

    A DataFrame is built from ``data`` with ``columns`` as column names and ``names``
    as index. Rows are sorted by their mean rank over the 'BIC', 'CH', and 'SC'
    columns (best first). One bar plot is drawn per entry of ``columns``, using
    ``colors[i]`` for the i-th column. The figure grid is fixed at 1 x 4 with a
    shared y-axis, so ``columns`` is expected to hold at most four entries.

    ``dict_xlims`` optionally maps a column name to the x-limits of its subplot.

    Returns
    -------
    fig, axes
        The matplotlib figure and the array of its four axes.
    """
    df_eval = pd.DataFrame(data, columns=columns, index=names)
    df_eval["rank"] = _get_rank(df_eval)
    df_eval = df_eval.sort_values(by="rank", ascending=True)
    # Group titles are placed above the first and the third subplot only
    titles = {0: "Number of clusters", 2: "Quality measures"}
    # Plotting
    fig, axes = plt.subplots(1, 4, sharey=True, figsize=figsize)
    for i, col in enumerate(columns):
        ax = axes[i]
        sns.barplot(ax=ax, data=df_eval, y=df_eval.index, x=col, color=colors[i])
        # Customize subplots
        ax.set_ylabel("")
        ax.set_xlabel(col)
        ax.axvline(0, color='black')
        if dict_xlims and col in dict_xlims:
            ax.set_xlim(dict_xlims[col])
        if i in titles:
            ax.set_title(titles[i], weight="bold")
        sns.despine(ax=ax, left=True)
        ax.tick_params(axis='y', which='both', left=False)
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.25, hspace=0)
    return fig, axes

def _plot_pca(df_pred=None, filter_classes=None, x=None, y=None, others=True, highlight_rel=True,
figsize=(6, 6), highlight_mean=True, list_classes=None):
""""""
Expand Down
Empty file.
Binary file not shown.
Binary file not shown.

This file was deleted.

Binary file not shown.
Empty file.
Binary file not shown.
9 changes: 7 additions & 2 deletions aaanalysis/feature_engineering/_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,13 @@ class CPP(Tool):
The CPP.run() method performs all steps of the CPP algorithm.
"""
def __init__(self, df_scales=None, df_cat=None, df_parts=None, split_kws=None,
accept_gaps=False, verbose=True):
def __init__(self,
df_scales=None,
df_cat=None,
df_parts=None,
split_kws=None,
accept_gaps=False,
verbose=None):
# Load default scales if not specified
sf = SequenceFeature()
if df_cat is None:
Expand Down
2 changes: 1 addition & 1 deletion aaanalysis/feature_engineering/_cpp_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import aaanalysis as aa
import aaanalysis.utils as ut

from ._backend.cpp_plot.cpp_plot_method import CPPPlots, get_optimal_fontsize
from ._backend.cpp.cpp_plot import CPPPlots, get_optimal_fontsize


# Settings
Expand Down
Empty file.
13 changes: 6 additions & 7 deletions aaanalysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,13 @@ def read_csv_cached(name, sep=None):

# Main check functions
def check_verbose(verbose):
check_bool(name="verbose", val=verbose)
# System level verbosity
verbose_value = options['verbose']
# If system level is negative return
if not verbose_value:
return verbose_value
if verbose is None:
# System level verbosity
verbose = options['verbose']
else:
return verbose
check_bool(name="verbose", val=verbose)
return verbose


# TODO check each of this checking function (make simpler)
# Check key dataframes using constants and general checking functions
Expand Down
Binary file modified docs/build/doctrees/environment.pickle
Binary file not shown.
Binary file modified docs/build/doctrees/generated/aaanalysis.AAclust.doctree
Binary file not shown.
Binary file modified docs/build/doctrees/generated/aaanalysis.CPP.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit f38cd9b

Please sign in to comment.