diff --git a/aaanalysis/__pycache__/utils.cpython-39.pyc b/aaanalysis/__pycache__/utils.cpython-39.pyc
index 93844be0..8f096ef8 100644
Binary files a/aaanalysis/__pycache__/utils.cpython-39.pyc and b/aaanalysis/__pycache__/utils.cpython-39.pyc differ
diff --git a/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc
index 1c0a77f9..8420f63e 100644
Binary files a/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc differ
diff --git a/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc b/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc
index e38ecd09..3882a91b 100644
Binary files a/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc differ
diff --git a/aaanalysis/_utils/_utils_check.py b/aaanalysis/_utils/_utils_check.py
index 6e136c27..56bd1f31 100644
--- a/aaanalysis/_utils/_utils_check.py
+++ b/aaanalysis/_utils/_utils_check.py
@@ -80,14 +80,13 @@ def check_tuple(name=None, val=None, n=None):
 
 # Array checking functions
 def check_feat_matrix(X=None, names=None, labels=None):
-    """Check if X and y match (y can be labels or names). Otherwise, transpose X or give error."""
-    # TODO type check
-    X = check_array(X)
+    """Transpose matrix and check if X and y match (y can be labels or names). Transpose back otherwise."""
+    X = check_array(X).transpose()
     if labels is not None:
         check_consistent_length(X, labels)
     n_samples, n_features = X.shape
     if n_samples == 0 or n_features == 0:
-        raise ValueError(f"Shape of X ({n_samples}, {n_features}) indicates empty feature matrix.")
+        raise ValueError(f"Shape of 'X' ({n_samples}, {n_features}) indicates empty feature matrix.")
     if names is None:
         return X, names
     else:
diff --git a/aaanalysis/_utils/utils_aaclust.py b/aaanalysis/_utils/utils_aaclust.py
index 26c45173..6974766e 100644
--- a/aaanalysis/_utils/utils_aaclust.py
+++ b/aaanalysis/_utils/utils_aaclust.py
@@ -37,3 +37,9 @@ def check_merge_metric(merge_metric=None):
         error = f"'merge_metric' should be None or one of following: {LIST_METRICS}"
         raise ValueError(error)
     return merge_metric
+
+def check_feat_matrix_n_clust_match(X=None, n_clusters=None):
+    """Check that 'X' contains more samples than 'n_clusters'."""
+    n_samples, n_features = X.shape
+    if n_samples <= n_clusters:
+        raise ValueError(f"'X' must contain more samples ({n_samples}) than 'n_clusters' ({n_clusters})")
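For reviewers, a minimal sketch of how the two validators above work together (shapes are hypothetical; the import path of the private helper is taken from this patch and may change):

    import numpy as np
    from sklearn.utils import check_array
    from aaanalysis._utils.utils_aaclust import check_feat_matrix_n_clust_match

    # Scales arrive column-wise (amino acids x scales), so check_feat_matrix
    # transposes first; each scale then becomes one sample/row of X.
    X = check_array(np.random.rand(20, 586)).transpose()  # shape (586, 20)

    check_feat_matrix_n_clust_match(X=X, n_clusters=100)  # passes: 586 > 100
    try:
        check_feat_matrix_n_clust_match(X=X, n_clusters=586)
    except ValueError as e:
        print(e)  # 'X' must contain more samples (586) than 'n_clusters' (586)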
diff --git a/aaanalysis/aaclust/__pycache__/aaclust.cpython-39.pyc b/aaanalysis/aaclust/__pycache__/aaclust.cpython-39.pyc
index 0c20615d..b68040b4 100644
Binary files a/aaanalysis/aaclust/__pycache__/aaclust.cpython-39.pyc and b/aaanalysis/aaclust/__pycache__/aaclust.cpython-39.pyc differ
diff --git a/aaanalysis/aaclust/aaclust.py b/aaanalysis/aaclust/aaclust.py
index 6b5628b0..a0833b84 100644
--- a/aaanalysis/aaclust/aaclust.py
+++ b/aaanalysis/aaclust/aaclust.py
@@ -11,6 +11,8 @@
 
 
 # I Helper Functions
+
+
 # Obtain centroids and medoids
 def cluster_center(X):
     """Compute cluster center (i.e., arithmetical mean over all data points/observations of a cluster)"""
@@ -29,7 +31,7 @@ def _cluster_medoid(X):
     """Obtain cluster medoids (i.e., scale closest to cluster center used as representative scale for a cluster)"""
     # Create new array with cluster center and given scales
     center_X = np.concatenate([cluster_center(X), X], axis=0)
-    # Get index for scale with highest correlation with cluster center
+    # Get index for scale with the highest correlation with cluster center
     ind_max = np.corrcoef(center_X)[0, 1:].argmax()
     return ind_max
@@ -410,6 +412,7 @@ def fit(self, X, names=None, on_center=True, min_th=0, merge_metric="euclidean"
         ut.check_min_th(min_th=min_th)
         merge_metric = ut.check_merge_metric(merge_metric=merge_metric)
         X, names = ut.check_feat_matrix(X=X, names=names)
+        ut.check_feat_matrix_n_clust_match(X=X, n_clusters=n_clusters)
         args = dict(model=self.model, model_kwargs=self._model_kwargs, min_th=min_th, on_center=on_center)
         # Clustering using given clustering models
         if n_clusters is not None:
diff --git a/aaanalysis/cpp/__pycache__/cpp.cpython-39.pyc b/aaanalysis/cpp/__pycache__/cpp.cpython-39.pyc
index 9d669888..88affe85 100644
Binary files a/aaanalysis/cpp/__pycache__/cpp.cpython-39.pyc and b/aaanalysis/cpp/__pycache__/cpp.cpython-39.pyc differ
diff --git a/aaanalysis/utils.py b/aaanalysis/utils.py
index 2bc53737..f512ce89 100644
--- a/aaanalysis/utils.py
+++ b/aaanalysis/utils.py
@@ -13,6 +13,7 @@
                                                 check_feat_matrix, check_col_in_df)
 from aaanalysis._utils._utils_output import (print_red, print_start_progress, print_progress, print_finished_progress)
 from aaanalysis._utils.utils_aaclust import (check_model, check_min_th, check_merge_metric,
+                                             check_feat_matrix_n_clust_match,
                                              METRIC_CORRELATION, LIST_METRICS)
 from aaanalysis._utils.utils_cpp import (check_color, check_y_categorical, check_labels,
                                          check_ylim, check_args_len, check_list_parts,
diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle
index 797868a7..bdff3fef 100644
Binary files a/docs/build/doctrees/environment.pickle and b/docs/build/doctrees/environment.pickle differ
diff --git a/docs/build/doctrees/generated/aaanalysis.AAclust.doctree b/docs/build/doctrees/generated/aaanalysis.AAclust.doctree
index 706d8507..c5989ddb 100644
Binary files a/docs/build/doctrees/generated/aaanalysis.AAclust.doctree and b/docs/build/doctrees/generated/aaanalysis.AAclust.doctree differ
diff --git a/docs/build/doctrees/generated/tutorial1_quick_start.doctree b/docs/build/doctrees/generated/tutorial1_quick_start.doctree
index 81728585..47be535c 100644
Binary files a/docs/build/doctrees/generated/tutorial1_quick_start.doctree and b/docs/build/doctrees/generated/tutorial1_quick_start.doctree differ
diff --git a/docs/build/html/_images/output_13_1.png b/docs/build/html/_images/output_13_1.png
index d7e83427..4a4b3449 100644
Binary files a/docs/build/html/_images/output_13_1.png and b/docs/build/html/_images/output_13_1.png differ
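A hedged sketch of the user-facing effect of the new guard in AAclust().fit() above (toy data; the model choice follows the tutorial and is not part of the patch):

    import numpy as np
    import aaanalysis as aa
    from sklearn.cluster import AgglomerativeClustering

    # 20 amino acids x 50 scales -> 50 samples after the internal transpose
    X = np.random.rand(20, 50)
    aac = aa.AAclust(model=AgglomerativeClustering)
    aac.fit(X, n_clusters=30)      # fine: 50 samples > 30 clusters
    try:
        aac.fit(X, n_clusters=50)  # 50 <= 50: rejected before any clustering runs
    except ValueError as e:
        print(e)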
diff --git a/docs/build/html/_sources/generated/tutorial1_quick_start.rst.txt b/docs/build/html/_sources/generated/tutorial1_quick_start.rst.txt
index 23e6ef58..419f2e19 100644
--- a/docs/build/html/_sources/generated/tutorial1_quick_start.rst.txt
+++ b/docs/build/html/_sources/generated/tutorial1_quick_start.rst.txt
@@ -34,9 +34,8 @@ available at your fingertips with the ``aa.load_scales()`` function.
 .. code:: ipython3
 
     import aaanalysis as aa
-    # Load scales and scale categories (AAontology)
+
     df_scales = aa.load_scales()
-    # Load training data
     df_seq = aa.load_dataset(name="DOM_GSEC", n=50)
     df_seq.head(5)
@@ -156,11 +155,12 @@ set of 100 scales, as defined by the ``n_clusters`` parameter:
 
 .. code:: ipython3
 
     from sklearn.cluster import AgglomerativeClustering
     import numpy as np
-    aac = aa.AAclust(model=AgglomerativeClustering, model_kwargs=dict(linkage="ward"))
+
+    aac = aa.AAclust(model=AgglomerativeClustering)
     X = np.array(df_scales)
-    scales = aac.fit(X, n_clusters=100, names=list(df_scales))
+    scales = aac.fit(X, names=list(df_scales), n_clusters=100)
     df_scales = df_scales[scales]
-    df_scales
+    df_scales[scales[0:4]].head(5)
@@ -189,23 +189,6 @@ set of 100 scales, as defined by the ``n_clusters`` parameter:
     [... rendered output table elided; footer: 20 rows × 100 columns ...]
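For context, a consolidated sketch of the updated clustering step (assumes the aaanalysis API exactly as shown in the hunks above). Dropping model_kwargs=dict(linkage="ward") should be behavior-neutral here, since "ward" is scikit-learn's default linkage for AgglomerativeClustering:

    import aaanalysis as aa
    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    df_scales = aa.load_scales()                     # amino acids x scales
    aac = aa.AAclust(model=AgglomerativeClustering)  # linkage="ward" is the sklearn default
    X = np.array(df_scales)
    scales = aac.fit(X, names=list(df_scales), n_clusters=100)
    df_scales = df_scales[scales]                    # redundancy-reduced set of 100 scales
    df_scales[scales[0:4]].head(5)                   # preview only a few columns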
@@ -731,15 +251,15 @@ sequences: the test set and the reference set.
 Supported by the C-terminal adjacent regions (JMD-N and JMD-C,
 respectively), obtained via ``sf.get_df_parts``. - ``Splits``: These
 ``Parts`` can be split into various continuous segments or
 discontinuous patterns, specified by
-``sf.get_split_kws()``. - ``Scales``: Sets of amino acid scales. We
-first use SequenceFeature to obtain Parts and Splits:
+``sf.get_split_kws()``. - ``Scales``: Sets of amino acid scales.
+
+We use SequenceFeature to obtain Parts and Splits:
 
 .. code:: ipython3
 
-    # Feature Engineering
     y = list(df_seq["label"])
     sf = aa.SequenceFeature()
-    df_parts = sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=10, list_parts=["tmd_jmd"])
+    df_parts = sf.get_df_parts(df_seq=df_seq, list_parts=["tmd_jmd"])
     split_kws = sf.get_split_kws(n_split_max=1, split_types=["Segment"])
     df_parts.head(5)
@@ -803,9 +323,9 @@ As a baseline approach, we use CPP to compute the average values for the
 
 .. code:: ipython3
 
-    # Small set of features (100 features created)
-    cpp = aa.CPP(df_parts=df_parts, df_scales=df_scales, split_kws=split_kws, verbose=False)
-    df_feat = cpp.run(labels=y, tmd_len=20, jmd_n_len=10, jmd_c_len=10, n_filter=100)  # Default values for lengths are used
+    # Small set of CPP features (100 features are created)
+    cpp = aa.CPP(df_scales=df_scales, df_parts=df_parts, split_kws=split_kws, verbose=False)
+    df_feat = cpp.run(labels=y)
     df_feat
@@ -927,16 +447,16 @@ A feature matrix from a given set of CPP features can be created using
 
     from sklearn.ensemble import RandomForestClassifier
     from sklearn.model_selection import cross_val_score
-    X = sf.feat_matrix(df_parts=df_parts, df_scales=df_scales, features=df_feat["feature"])
-    # ML evaluation
+
+    X = sf.feat_matrix(df_parts=df_parts, features=df_feat["feature"])
     rf = RandomForestClassifier()
-    cv_base = cross_val_score(rf, X, y, scoring="accuracy", cv=5, n_jobs=8)  # Set n_jobs=1 to disable multi-processing
+    cv_base = cross_val_score(rf, X, y, scoring="accuracy")
     print(f"Mean accuracy of {round(np.mean(cv_base), 2)}")
 
 .. parsed-literal::
 
-    Mean accuracy of 0.57
+    Mean accuracy of 0.58
 
 
 Creating more features with CPP will take some more time, but improves
@@ -944,12 +464,11 @@ prediction performance:
 
 .. code:: ipython3
 
-    # Default CPP features (around 100.000 features)
-    split_kws = sf.get_split_kws()
-    df_parts = sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=10)
-    cpp = aa.CPP(df_parts=df_parts, df_scales=df_scales, split_kws=split_kws, verbose=False)
-    df_feat = cpp.run(labels=y, n_processes=8, n_filter=100)
-    df_feat
+    # CPP features with default splits (around 100,000 features)
+    df_parts = sf.get_df_parts(df_seq=df_seq)
+    cpp = aa.CPP(df_scales=df_scales, df_parts=df_parts, verbose=False)
+    df_feat = cpp.run(labels=y)
+    df_feat.head(10)
@@ -1071,104 +590,87 @@ prediction performance:
     [... rendered output table elided; footer: 100 rows × 13 columns ...]
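To make the updated tutorial flow easier to review, here is one consolidated, hedged sketch of the CPP workflow as it reads after this patch (all calls taken from the hunks above; accuracies will vary between runs):

    import aaanalysis as aa
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    # Data and scales (in the tutorial, df_scales is the AAclust-reduced set)
    df_scales = aa.load_scales()
    df_seq = aa.load_dataset(name="DOM_GSEC", n=50)
    y = list(df_seq["label"])

    # Parts with default JMD lengths (explicit jmd_n_len/jmd_c_len now dropped)
    sf = aa.SequenceFeature()
    df_parts = sf.get_df_parts(df_seq=df_seq)

    # CPP with default splits creates ~100,000 candidate features,
    # filtered down to the feature set returned in df_feat
    cpp = aa.CPP(df_scales=df_scales, df_parts=df_parts, verbose=False)
    df_feat = cpp.run(labels=y)

    # Feature matrix and cross-validated RandomForest evaluation
    X = sf.feat_matrix(df_parts=df_parts, features=df_feat["feature"])
    rf = RandomForestClassifier()
    cv = cross_val_score(rf, X, y, scoring="accuracy", cv=5)
    print(f"Mean accuracy of {round(np.mean(cv), 2)}")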
@@ -1182,21 +684,23 @@ Which can again be used for machine learning:
 
     warnings.simplefilter(action='ignore', category=FutureWarning)
     import matplotlib.pyplot as plt
     import pandas as pd
-    X = sf.feat_matrix(df_parts=df_parts, df_scales=df_scales, features=df_feat["feature"])
-    # ML evaluation
+
+    X = sf.feat_matrix(df_parts=df_parts, features=df_feat["feature"])
     rf = RandomForestClassifier()
     cv = cross_val_score(rf, X, y, scoring="accuracy", cv=5, n_jobs=1)
     print(f"Mean accuracy of {round(np.mean(cv), 2)}")
+
     aa.plot_settings(font_scale=1.1)
     sns.barplot(pd.DataFrame({"Baseline": cv_base, "CPP": cv}), palette=["tab:blue", "tab:red"])
     plt.ylabel("Mean accuracy", size=aa.plot_gcfs()+1)
+    plt.ylim(0, 1)
     sns.despine()
     plt.show()
 
 .. parsed-literal::
 
-    Mean accuracy of 0.95
+    Mean accuracy of 0.9
diff --git a/docs/build/html/generated/aaanalysis.AAclust.html b/docs/build/html/generated/aaanalysis.AAclust.html
index 51764f6c..2ecc5c34 100644
--- a/docs/build/html/generated/aaanalysis.AAclust.html
+++ b/docs/build/html/generated/aaanalysis.AAclust.html
[Regenerated Sphinx HTML hunks omitted: the rendered AAclust API page and quick-start tutorial page mirror the docstring and tutorial1_quick_start.rst.txt changes above.]
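As a closing usage note, the final tutorial cell above assembled into one hedged sketch (assumes cv_base and cv, the two cross-validation score arrays from the steps above, are in scope):

    import aaanalysis as aa
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    aa.plot_settings(font_scale=1.1)
    sns.barplot(pd.DataFrame({"Baseline": cv_base, "CPP": cv}),
                palette=["tab:blue", "tab:red"])
    plt.ylabel("Mean accuracy", size=aa.plot_gcfs() + 1)
    plt.ylim(0, 1)  # fixed y-range keeps the baseline/CPP gap comparable across runs
    sns.despine()
    plt.show()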