Skip to content

Commit

Permalink
Add metric parameter to AAclust().comp_medoids
Browse files Browse the repository at this point in the history
  • Loading branch information
breimanntools committed Oct 9, 2023
1 parent d3bfe66 commit 941cadb
Show file tree
Hide file tree
Showing 80 changed files with 431 additions and 203 deletions.
Binary file modified aaanalysis/__pycache__/config.cpython-39.pyc
Binary file not shown.
Binary file modified aaanalysis/__pycache__/utils.cpython-39.pyc
Binary file not shown.
Binary file modified aaanalysis/_utils/__pycache__/check_data.cpython-39.pyc
Binary file not shown.
Binary file modified aaanalysis/_utils/__pycache__/utils_output.cpython-39.pyc
Binary file not shown.
Binary file not shown.
4 changes: 2 additions & 2 deletions aaanalysis/_utils/check_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,11 @@ def check_labels(labels=None):
return labels


def check_match_X_labels(X=None, X_name="X", labels=None, labels_name="labels"):
    """Check that a feature matrix and a label sequence describe the same number of samples.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Two-dimensional feature matrix; only its ``shape`` attribute is inspected.
    X_name : str, default="X"
        Name under which ``X`` is reported in the error message.
    labels : sized sequence
        Labels, expected to contain one entry per row of ``X``.
    labels_name : str, default="labels"
        Name under which ``labels`` is reported in the error message.

    Raises
    ------
    ValueError
        If the number of rows of ``X`` differs from ``len(labels)``.
    """
    # Unpacking into two names implicitly requires X to be two-dimensional
    n_samples, _ = X.shape
    n_labels = len(labels)
    if n_samples != n_labels:
        raise ValueError(f"n_samples does not match for '{X_name}' ({n_samples}) and '{labels_name}' ({n_labels}).")

# Check sets
def check_superset_subset(subset=None, superset=None, name_subset=None, name_superset=None):
Expand Down
12 changes: 11 additions & 1 deletion aaanalysis/_utils/utils_ploting.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
"""
This is a script for internal plotting utility functions used in the backend.
"""
"""
import seaborn as sns

# Main function
def plot_gco(option='font.size', show_options=False):
    """Get the current value of an option from the seaborn plotting context.

    Parameters
    ----------
    option : str, default='font.size'
        Key to look up in the mapping returned by ``sns.plotting_context()``.
    show_options : bool, default=False
        If ``True``, the complete plotting context is printed before the lookup.

    Returns
    -------
    option_value
        Value stored under ``option`` in the current plotting context.

    Notes
    -----
    An unknown ``option`` is not handled here; the lookup error raised by the
    context mapping propagates to the caller.
    """
    current_context = sns.plotting_context()
    if show_options:
        print(current_context)
    option_value = current_context[option]  # By default the 'font.size' entry
    return option_value
4 changes: 3 additions & 1 deletion aaanalysis/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
"""
from typing import Dict, Any

# System level options
verbose = True

# Enables setting of system level variables like in matplotlib
class Settings:
def __init__(self):
self._settings: Dict[str, Any] = {
'verbose': True
'verbose': verbose,
}

def __getitem__(self, key: str) -> Any:
Expand Down
Binary file modified aaanalysis/feature_engineering/__pycache__/_aaclust.cpython-39.pyc
Binary file not shown.
Binary file not shown.
68 changes: 35 additions & 33 deletions aaanalysis/feature_engineering/_aaclust.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,6 @@
compute_correlation)

# I Helper Functions
# Check parameter functions
def check_merge_metric(merge_metric=None):
    """Validate the ``merge_metric`` parameter.

    Parameters
    ----------
    merge_metric : str or None
        Accepted when it is ``None`` or contained in ``ut.LIST_METRICS``.

    Raises
    ------
    ValueError
        If ``merge_metric`` is neither ``None`` nor an entry of ``ut.LIST_METRICS``.
    """
    if merge_metric is None:
        return
    if merge_metric not in ut.LIST_METRICS:
        error = f"'merge_metric' should be None or one of following: {ut.LIST_METRICS}"
        raise ValueError(error)


# Check parameter matching functions
def check_match_X_names(X=None, names=None, accept_none=True):
""""""
Expand Down Expand Up @@ -122,11 +114,11 @@ class AAclust(Wrapper):
Cluster labels in the order of samples in ``X``.
centers_ : `array-like, shape (n_clusters, n_features)`
Average scale values corresponding to each cluster.
center_labels_ : `array-like, shape (n_clusters, )`
labels_centers_ : `array-like, shape (n_clusters, )`
Cluster labels for each cluster center.
medoids_ : `array-like, shape (n_clusters, n_features)`
Representative samples, one for each cluster.
medoid_labels_ : `array-like, shape (n_clusters, )`
labels_medoids_ : `array-like, shape (n_clusters, )`
Cluster labels for each medoid.
is_medoid_ : `array-like, shape (n_samples, )`
Array indicating samples being medoids (1) or not (0). Same order as ``labels_``.
Expand Down Expand Up @@ -161,9 +153,9 @@ def __init__(self,
self.n_clusters: Optional[int] = None
self.labels_: Optional[ut.ArrayLike1D] = None
self.centers_: Optional[ut.ArrayLike1D] = None
self.center_labels_: Optional[ut.ArrayLike1D] = None
self.labels_centers_: Optional[ut.ArrayLike1D] = None
self.medoids_: Optional[ut.ArrayLike1D] = None
self.medoid_labels_: Optional[ut.ArrayLike1D] = None
self.labels_medoids_: Optional[ut.ArrayLike1D] = None
self.is_medoid_: Optional[ut.ArrayLike1D] = None
self.medoid_names_: Optional[List[str]] = None

Expand All @@ -174,7 +166,8 @@ def fit(self,
n_clusters: Optional[int] = None,
on_center: bool = True,
min_th: float = 0.3,
merge_metric: Union[str, None] = "euclidean",
merge: bool = True,
metric: str = "euclidean",
names: Optional[List[str]] = None) -> "AAclust":
"""
Applies AAclust algorithm to feature matrix (``X``).
Expand All @@ -194,14 +187,15 @@ def fit(self,
Pearson correlation threshold for clustering (between 0 and 1).
on_center
If ``True``, ``min_th`` is applied to the cluster center. Otherwise, to all cluster members.
merge_metric
Metric used as similarity measure for optional cluster merging:
merge
If ``True``, the optional merging step is performed.
metric
Metric used as similarity measure for optional cluster merging and obtained medoids:
- ``None``: No merging is performed
- ``correlation``: Pearson correlation
- ``euclidean``: Euclidean distance
- ``manhattan``: Manhattan distance
- ``cosine``: Cosine distance
- ``correlation``: Pearson correlation (maximum)
- ``euclidean``: Euclidean distance (minimum)
- ``manhattan``: Manhattan distance (minimum)
- ``cosine``: Cosine distance (minimum)
names
List of sample names. If provided, sets :attr:`AAclust.medoid_names_` attribute.
Expand Down Expand Up @@ -236,7 +230,7 @@ def fit(self,
names = ut.check_list_like(name="names", val=names, accept_none=True)
ut.check_number_range(name="mint_th", val=min_th, min_val=0, max_val=1, just_int=False, accept_none=False)
ut.check_number_range(name="n_clusters", val=n_clusters, min_val=1, just_int=True, accept_none=True)
check_merge_metric(merge_metric=merge_metric)
ut.check_metric(metric=metric)
ut.check_bool(name="on_center", val=on_center)

check_match_X_n_clusters(X=X, n_clusters=n_clusters, accept_none=True)
Expand All @@ -263,10 +257,10 @@ def fit(self,
self.model = self.model_class(n_clusters=n_clusters, **self._model_kwargs)
labels = self.model.fit(X).labels_.tolist()
# Step 3. Cluster merging (optional)
if merge_metric is not None:
if metric is not None:
if self._verbose:
ut.print_out(f"3. Cluster merging (k={len(labels)})", end="")
labels = merge_clusters(X, labels=labels, min_th=min_th, on_center=on_center, metric=merge_metric)
labels = merge_clusters(X, labels=labels, min_th=min_th, on_center=on_center, metric=metric)
n_clusters = len(set(labels))

# Obtain cluster centers and medoids
Expand All @@ -278,9 +272,9 @@ def fit(self,
self.n_clusters = len(set(labels))
self.labels_ = np.array(labels)
self.centers_ = centers
self.center_labels_ = center_labels
self.labels_centers_ = center_labels
self.medoids_ = medoids # Representative scales
self.medoid_labels_ = medoid_labels
self.labels_medoids_ = medoid_labels
self.is_medoid_ = np.array([i in medoid_ind for i in range(0, len(labels))])
if names is not None:
self.medoid_names_ = [names[i] for i in medoid_ind]
Expand Down Expand Up @@ -341,7 +335,6 @@ def eval(self,
labels = self.labels_
labels = ut.check_labels(labels=labels)
ut.check_match_X_labels(X=X, labels=labels)

# Get number of clusters (number of medoids) and evaluation measures
n_clusters = len(set(labels))
bic, ch, sc = evaluate_clustering(X, labels=labels)
Expand Down Expand Up @@ -413,7 +406,7 @@ def comp_centers(X: ut.ArrayLike2D,
-------
centers : `array-like, shape (n_clusters, )`
The computed center for each cluster.
center_labels : `array-like, shape (n_clusters, )`
labels_centers : `array-like, shape (n_clusters, )`
The labels associated with each computed center.
"""
# Check input
Expand All @@ -422,14 +415,15 @@ def comp_centers(X: ut.ArrayLike2D,
labels = ut.check_labels(labels=labels)
ut.check_match_X_labels(X=X, labels=labels)
# Get cluster centers
centers, center_labels = compute_centers(X, labels=labels)
return centers, center_labels
centers, labels_centers = compute_centers(X, labels=labels)
return centers, labels_centers

@staticmethod
@ut.doc_params(doc_param_X=doc_param_X,
doc_param_labels=doc_param_labels)
def comp_medoids(X: ut.ArrayLike2D,
labels: ut.ArrayLike1D = None
labels: ut.ArrayLike1D = None,
metric: str = "correlation"
) -> Tuple[ut.ArrayLike1D, ut.ArrayLike1D]:
"""
Computes the medoid of each cluster based on the given labels.
Expand All @@ -438,22 +432,30 @@ def comp_medoids(X: ut.ArrayLike2D,
----------
{doc_param_X}
{doc_param_labels}
metric
Metric used as similarity measure to obtain medoids:
- ``correlation``: Pearson correlation (maximum)
- ``euclidean``: Euclidean distance (minimum)
- ``manhattan``: Manhattan distance (minimum)
- ``cosine``: Cosine distance (minimum)
Returns
-------
medoids : `array-like, shape (n_clusters, )`
The medoid for each cluster.
medoid_labels : `array-like, shape (n_clusters, )`
labels_medoids : `array-like, shape (n_clusters, )`
The labels corresponding to each medoid.
"""
# Check input
X = ut.check_X(X=X)
ut.check_X_unique_samples(X=X)
labels = ut.check_labels(labels=labels)
ut.check_match_X_labels(X=X, labels=labels)
ut.check_metric(metric=metric)
# Get cluster medoids
medoids, medoid_labels, _ = compute_medoids(X, labels=labels)
return medoids, medoid_labels
medoids, labels_medoids, _ = compute_medoids(X, labels=labels, metric=metric)
return medoids, labels_medoids

@staticmethod
@ut.doc_params(doc_param_X=doc_param_X,
Expand Down
68 changes: 60 additions & 8 deletions aaanalysis/feature_engineering/_aaclust_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import aaanalysis as aa
import aaanalysis.utils as ut

from ._backend.aaclust.aaclust_plot import plot_eval
from ._backend.aaclust.aaclust_plot import plot_eval, plot_center_or_medoid, plot_correlation


# I Helper Functions
Expand Down Expand Up @@ -59,6 +59,7 @@ def check_dict_xlims(dict_xlims=None):


# TODO: add check functions, finish other methods, testing, compression

# II Main Functions
class AAclustPlot:
"""Plot results of AAclust analysis.
Expand Down Expand Up @@ -89,7 +90,7 @@ def __init__(self,
self.model_kwargs = model_kwargs

@staticmethod
def eval(data: ut.ArrayLike2D,
def eval(data_eval: ut.ArrayLike2D,
names: Optional[List[str]] = None,
dict_xlims: Optional[Union[None, dict]] = None,
figsize: Optional[Tuple[int, int]] = (7, 6)
Expand All @@ -101,7 +102,7 @@ def eval(data: ut.ArrayLike2D,
Parameters
----------
data : `array-like, shape (n_samples, n_features)`
data_eval : `array-like, shape (n_samples, n_features)`
Evaluation matrix or DataFrame. `Rows` correspond to scale sets and `columns` to the following
four evaluation measures:
Expand Down Expand Up @@ -134,9 +135,9 @@ def eval(data: ut.ArrayLike2D,
* :meth:`AAclust.eval` for details on evaluation measures.
"""
# Check input
ut.check_array_like(name="data", val=data)
ut.check_array_like(name="data", val=data_eval)
ut.check_list_like(name="names", val=names, accept_none=True)
df_eval = check_match_data_names(data=data, names=names)
df_eval = check_match_data_names(data=data_eval, names=names)
check_dict_xlims(dict_xlims=dict_xlims)
ut.check_tuple(name="figsize", val=figsize, n=2, accept_none=True)
# Plotting
Expand All @@ -148,11 +149,62 @@ def eval(data: ut.ArrayLike2D,
return fig, axes


def center(self,
           X: ut.ArrayLike2D,
           labels: ut.ArrayLike1D = None,
           figsize: Optional[Tuple[int, int]] = (7, 6),
           dot_alpha: Optional[float] = 0.75,
           dot_size: Optional[int] = 100,
           component_x: Optional[int] = 1,
           component_y: Optional[int] = 2,
           ) -> pd.DataFrame:
    """PCA plot of clustering with centers highlighted.

    Parameters
    ----------
    X
        Feature matrix; validated by ``ut.check_X`` and ``ut.check_X_unique_samples``.
    labels
        Cluster labels; validated by ``ut.check_labels`` and required to match the
        number of samples in ``X``.
    figsize
        Figure size as tuple of two values (``None`` is accepted by the check).
    dot_alpha
        Value between 0 and 1 forwarded to the backend as ``dot_alpha``.
    dot_size
        Integer >= 1 forwarded to the backend as ``dot_size``.
    component_x
        Integer >= 1 selecting the component forwarded as ``component_x``.
    component_y
        Integer >= 1 selecting the component forwarded as ``component_y``.

    Returns
    -------
    df_components
        Result of the backend function ``plot_center_or_medoid`` called with ``plot_centers=True``
        (presumably the component values of the plotted samples -- confirm in the backend).
    """
    # Check input
    X = ut.check_X(X=X)
    ut.check_X_unique_samples(X=X)
    labels = ut.check_labels(labels=labels)
    ut.check_match_X_labels(X=X, labels=labels)
    ut.check_number_range(name="component_x", val=component_x, accept_none=False, min_val=1, just_int=True)
    ut.check_number_range(name="component_y", val=component_y, accept_none=False, min_val=1, just_int=True)
    ut.check_tuple(name="figsize", val=figsize, n=2, accept_none=True)
    ut.check_number_range(name="dot_alpha", val=dot_alpha, accept_none=False, min_val=0, max_val=1, just_int=False)
    ut.check_number_range(name="dot_size", val=dot_size, accept_none=False, min_val=1, just_int=True)
    # Create plot
    df_components = plot_center_or_medoid(X, labels=labels, plot_centers=True,
                                          component_x=component_x, component_y=component_y,
                                          model_class=self.model_class, model_kwargs=self.model_kwargs,
                                          figsize=figsize, dot_size=dot_size, dot_alpha=dot_alpha)
    return df_components

def medoids(self,
            X: ut.ArrayLike2D,
            labels: ut.ArrayLike1D = None,
            figsize: Optional[Tuple[int, int]] = (7, 6),
            dot_alpha: Optional[float] = 0.75,
            dot_size: Optional[int] = 100,
            component_x: Optional[int] = 1,
            component_y: Optional[int] = 2,
            metric: Optional[str] = "euclidean",
            ) -> pd.DataFrame:
    """PCA plot of clustering with medoids highlighted.

    Parameters
    ----------
    X
        Feature matrix; validated by ``ut.check_X`` and ``ut.check_X_unique_samples``.
    labels
        Cluster labels; validated by ``ut.check_labels`` and required to match the
        number of samples in ``X``.
    figsize
        Figure size as tuple of two values (``None`` is accepted by the check).
    dot_alpha
        Value between 0 and 1 forwarded to the backend as ``dot_alpha``.
    dot_size
        Integer >= 1 forwarded to the backend as ``dot_size``.
    component_x
        Integer >= 1 selecting the component forwarded as ``component_x``.
    component_y
        Integer >= 1 selecting the component forwarded as ``component_y``.
    metric
        Metric forwarded to the backend for obtaining the medoids; accepted values
        are those permitted by ``ut.check_metric``.

    Returns
    -------
    df_components
        Result of the backend function ``plot_center_or_medoid`` called with ``plot_centers=False``
        (presumably the component values of the plotted samples -- confirm in the backend).
    """
    # Check input
    X = ut.check_X(X=X)
    ut.check_X_unique_samples(X=X)
    labels = ut.check_labels(labels=labels)
    ut.check_match_X_labels(X=X, labels=labels)
    ut.check_number_range(name="component_x", val=component_x, accept_none=False, min_val=1, just_int=True)
    ut.check_number_range(name="component_y", val=component_y, accept_none=False, min_val=1, just_int=True)
    ut.check_tuple(name="figsize", val=figsize, n=2, accept_none=True)
    ut.check_number_range(name="dot_alpha", val=dot_alpha, accept_none=False, min_val=0, max_val=1, just_int=False)
    ut.check_number_range(name="dot_size", val=dot_size, accept_none=False, min_val=1, just_int=True)
    ut.check_metric(metric=metric)
    # Create plot
    df_components = plot_center_or_medoid(X, labels=labels, plot_centers=False,
                                          component_x=component_x, component_y=component_y,
                                          metric=metric,
                                          model_class=self.model_class, model_kwargs=self.model_kwargs,
                                          figsize=figsize, dot_size=dot_size, dot_alpha=dot_alpha)
    return df_components

@staticmethod
def correlation(df_corr=None):
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
46 changes: 32 additions & 14 deletions aaanalysis/feature_engineering/_backend/aaclust/_utils_aaclust.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,48 @@
"""
import numpy as np
from collections import OrderedDict

from sklearn.metrics import pairwise_distances

# II Main Functions
def _cluster_center(X):
    """Compute cluster center (i.e., arithmetical mean over all data points/observations of a cluster).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Observations of one cluster, one per row.

    Returns
    -------
    center : array, shape (1, n_features)
        Column-wise mean of ``X``, kept two-dimensional so that it can be
        concatenated with other row arrays.
    """
    center = X.mean(axis=0)
    # Re-add the leading axis removed by the mean: (n_features,) -> (1, n_features)
    return center[np.newaxis, :]


def _cluster_medoid(X, metric="correlation"):
    """Obtain cluster medoid (i.e., scale closest to cluster center used as representative scale for a cluster).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Observations of one cluster, one per row.
    metric : str, default="correlation"
        ``"correlation"`` selects the row with the highest Pearson correlation to the
        cluster center. Any other value is passed as ``metric`` to
        ``sklearn.metrics.pairwise_distances`` and the row with the smallest distance
        to the center is selected.

    Returns
    -------
    medoid_index : int
        Row index (within ``X``) of the medoid.
    """
    # Center is computed once and shared by both branches; it already has shape (1, n_features)
    center = _cluster_center(X)
    if metric == "correlation":
        # Stack the center on top of X: row 0 of the correlation matrix then holds the
        # correlation of the center with itself (column 0) and with each sample (columns 1..n)
        center_X = np.concatenate([center, X], axis=0)
        medoid_index = np.corrcoef(center_X)[0, 1:].argmax()
    else:
        # Distances from the center to all points in X, shape (1, n_samples)
        distances = pairwise_distances(center, X, metric=metric)
        # argmin over the flattened (1, n_samples) array equals the sample index
        medoid_index = np.argmin(distances)
    return medoid_index


def _compute_centers(X, labels=None):
    """Obtain cluster centers and their labels.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix, one sample per row.
    labels : sequence
        Cluster label of each sample, in the order of the rows of ``X``.

    Returns
    -------
    centers : array, shape (n_clusters, n_features)
        Mean of the samples of each cluster, rounded to 3 decimals.
    labels_centers : array, shape (n_clusters, )
        Label of each center, in order of first appearance in ``labels``.
    """
    # Unique labels in order of first occurrence
    labels_centers = list(OrderedDict.fromkeys(labels))
    # One boolean row mask per cluster
    list_masks = [np.array([i == label for i in labels], dtype=bool) for label in labels_centers]
    centers = np.concatenate([_cluster_center(X[mask]) for mask in list_masks]).round(3)
    labels_centers = np.array(labels_centers)
    return centers, labels_centers

def _compute_medoids(X, labels=None, metric="correlation"):
    """Obtain cluster medoids and their labels.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix, one sample per row.
    labels : sequence
        Cluster label of each sample, in the order of the rows of ``X``.
    metric : str, default="correlation"
        Metric forwarded to ``_cluster_medoid`` for selecting the medoid of each cluster.

    Returns
    -------
    medoids : array, shape (n_clusters, n_features)
        Rows of ``X`` selected as medoids.
    labels_medoids : array, shape (n_clusters, )
        Label of each medoid, in order of first appearance in ``labels``.
    medoid_ind : list
        Row indices (into ``X``) of the medoids.
    """
    # Unique labels in order of first occurrence
    unique_labels = list(OrderedDict.fromkeys(labels))
    medoid_ind = []
    for label in unique_labels:
        mask = np.array([i == label for i in labels], dtype=bool)
        # Global row indices of the members of this cluster
        member_ind = np.flatnonzero(mask)
        # Medoid position within the cluster, translated back to a global row index
        local_ind = _cluster_medoid(X[mask], metric=metric)
        medoid_ind.append(member_ind[local_ind])
    # Labels and data of medoids
    labels_medoids = np.array([labels[i] for i in medoid_ind])
    medoids = np.array([X[i, :] for i in medoid_ind])
    return medoids, labels_medoids, medoid_ind
Loading

0 comments on commit 941cadb

Please sign in to comment.