diff --git a/aaanalysis/data_loader/__pycache__/data_loader.cpython-39.pyc b/aaanalysis/data_loader/__pycache__/data_loader.cpython-39.pyc index a16d349a..77ec981e 100644 Binary files a/aaanalysis/data_loader/__pycache__/data_loader.cpython-39.pyc and b/aaanalysis/data_loader/__pycache__/data_loader.cpython-39.pyc differ diff --git a/aaanalysis/data_loader/data_loader.py b/aaanalysis/data_loader/data_loader.py index 1b218681..a423c985 100644 --- a/aaanalysis/data_loader/data_loader.py +++ b/aaanalysis/data_loader/data_loader.py @@ -63,8 +63,8 @@ def load_dataset(name: str = "INFO", """ Load protein benchmarking datasets. - The benchmarks are distinguished into amino acid ('AA'), domain ('DOM'), and sequence ('SEQ') level - datasets. Use default settings (name='INFO') of an overview table. Detailed analysis is in :cite:`Breimann23a`. + The benchmarks are categorized into amino acid ('AA'), domain ('DOM'), and sequence ('SEQ') level + datasets. Use default settings (``name='INFO'``) for an overview table. Detailed analysis is in [Breimann23a]_. Parameters ---------- @@ -88,8 +88,8 @@ def load_dataset(name: str = "INFO", Returns ------- - df_seq - Dataframe with the selected sequence dataset. + pd.DataFrame + Dataframe (df_seq) containing the selected sequence dataset. 
Notes ----- diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle index a64cf533..727bdbc2 100644 Binary files a/docs/build/doctrees/environment.pickle and b/docs/build/doctrees/environment.pickle differ diff --git a/docs/build/doctrees/generated/aaanalysis.AAclust.doctree b/docs/build/doctrees/generated/aaanalysis.AAclust.doctree index 3e7f4b92..7fd62759 100644 Binary files a/docs/build/doctrees/generated/aaanalysis.AAclust.doctree and b/docs/build/doctrees/generated/aaanalysis.AAclust.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.CPP.doctree b/docs/build/doctrees/generated/aaanalysis.CPP.doctree index f6668828..3306522b 100644 Binary files a/docs/build/doctrees/generated/aaanalysis.CPP.doctree and b/docs/build/doctrees/generated/aaanalysis.CPP.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.CPPPlot.doctree b/docs/build/doctrees/generated/aaanalysis.CPPPlot.doctree index c92f93d3..8a761868 100644 Binary files a/docs/build/doctrees/generated/aaanalysis.CPPPlot.doctree and b/docs/build/doctrees/generated/aaanalysis.CPPPlot.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.SequenceFeature.doctree b/docs/build/doctrees/generated/aaanalysis.SequenceFeature.doctree index 1ad1d937..c463fb1a 100644 Binary files a/docs/build/doctrees/generated/aaanalysis.SequenceFeature.doctree and b/docs/build/doctrees/generated/aaanalysis.SequenceFeature.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.dPULearn.doctree b/docs/build/doctrees/generated/aaanalysis.dPULearn.doctree index 37b84454..83c7d59d 100644 Binary files a/docs/build/doctrees/generated/aaanalysis.dPULearn.doctree and b/docs/build/doctrees/generated/aaanalysis.dPULearn.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.load_dataset.doctree b/docs/build/doctrees/generated/aaanalysis.load_dataset.doctree index 6bf538ef..c8e506a9 100644 Binary files 
a/docs/build/doctrees/generated/aaanalysis.load_dataset.doctree and b/docs/build/doctrees/generated/aaanalysis.load_dataset.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.load_scales.doctree b/docs/build/doctrees/generated/aaanalysis.load_scales.doctree index bce2a1d6..7cb77afa 100644 Binary files a/docs/build/doctrees/generated/aaanalysis.load_scales.doctree and b/docs/build/doctrees/generated/aaanalysis.load_scales.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.plot_gcfs.doctree b/docs/build/doctrees/generated/aaanalysis.plot_gcfs.doctree index 7eb49833..584a1de2 100644 Binary files a/docs/build/doctrees/generated/aaanalysis.plot_gcfs.doctree and b/docs/build/doctrees/generated/aaanalysis.plot_gcfs.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.plot_get_cdict.doctree b/docs/build/doctrees/generated/aaanalysis.plot_get_cdict.doctree index 68cf2afa..223d6a9a 100644 Binary files a/docs/build/doctrees/generated/aaanalysis.plot_get_cdict.doctree and b/docs/build/doctrees/generated/aaanalysis.plot_get_cdict.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.plot_get_cmap.doctree b/docs/build/doctrees/generated/aaanalysis.plot_get_cmap.doctree index f401ddab..e571284f 100644 Binary files a/docs/build/doctrees/generated/aaanalysis.plot_get_cmap.doctree and b/docs/build/doctrees/generated/aaanalysis.plot_get_cmap.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.plot_set_legend.doctree b/docs/build/doctrees/generated/aaanalysis.plot_set_legend.doctree index f5d493ac..957b6994 100644 Binary files a/docs/build/doctrees/generated/aaanalysis.plot_set_legend.doctree and b/docs/build/doctrees/generated/aaanalysis.plot_set_legend.doctree differ diff --git a/docs/build/doctrees/generated/aaanalysis.plot_settings.doctree b/docs/build/doctrees/generated/aaanalysis.plot_settings.doctree index a7b4da11..2d611bd2 100644 Binary files 
a/docs/build/doctrees/generated/aaanalysis.plot_settings.doctree and b/docs/build/doctrees/generated/aaanalysis.plot_settings.doctree differ diff --git a/docs/build/html/.buildinfo b/docs/build/html/.buildinfo index 3bbfcdee..eb42c979 100644 --- a/docs/build/html/.buildinfo +++ b/docs/build/html/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 97d403b4e986b8364b888ee4b33cf00e +config: 58b28218296ef1ff785a2881fa4b8801 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/build/html/_index/badges.html b/docs/build/html/_index/badges.html index 6af97eb0..c2152d89 100644 --- a/docs/build/html/_index/badges.html +++ b/docs/build/html/_index/badges.html @@ -95,7 +95,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/_index/overview.html b/docs/build/html/_index/overview.html index 49ea238e..a2bd697a 100644 --- a/docs/build/html/_index/overview.html +++ b/docs/build/html/_index/overview.html @@ -95,7 +95,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/_index/tables.html b/docs/build/html/_index/tables.html index c7bee5ba..11f186a6 100644 --- a/docs/build/html/_index/tables.html +++ b/docs/build/html/_index/tables.html @@ -102,7 +102,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/_index/usage_principles/data_loading.html b/docs/build/html/_index/usage_principles/data_loading.html index b71c77ac..9bf5b8c9 100644 --- a/docs/build/html/_index/usage_principles/data_loading.html +++ b/docs/build/html/_index/usage_principles/data_loading.html @@ -95,7 +95,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/_modules/aaanalysis/aaclust/aaclust.html b/docs/build/html/_modules/aaanalysis/aaclust/aaclust.html deleted file mode 100644 index e01099cb..00000000 --- a/docs/build/html/_modules/aaanalysis/aaclust/aaclust.html +++ /dev/null @@ -1,737 +0,0 @@ - - - - - - aaanalysis.aaclust.aaclust — AAanalysis - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - -
    - -
    -
    -
    - -
    -
    -
    -
    - -

    Source code for aaanalysis.aaclust.aaclust

    -"""
    -This is a script for the AAclust clustering wrapper method.
    -"""
    -import pandas as pd
    -import numpy as np
    -from collections import OrderedDict
    -from sklearn.metrics.pairwise import pairwise_distances
    -from sklearn.cluster import KMeans
    -
    -import aaanalysis.utils as ut
    -
    -
    -# I Helper Functions
    -# Obtain centroids and medoids
    -def cluster_center(X):
    -    """Compute cluster center (i.e., arithmetical mean over all data points/observations of a cluster)"""
    -    return X.mean(axis=0)[np.newaxis, :]
    -
    -
    -def get_cluster_centers(X, labels=None):
    -    """Obtain cluster centers and their labels"""
    -    center_labels = list(OrderedDict.fromkeys(labels))
    -    list_masks = [[True if i == label else False for i in labels] for label in center_labels]
    -    centers = np.concatenate([cluster_center(X[mask]) for mask in list_masks]).round(3)
    -    return centers, center_labels
    -
    -
    -def _cluster_medoid(X):
    -    """Obtain cluster medoids (i.e., scale closest to cluster center used as representative scale for a cluster)"""
    -    # Create new array with cluster center and given
    -    center_X = np.concatenate([cluster_center(X), X], axis=0)
    -    # Get index for scale with highest correlation with cluster center
    -    ind_max = np.corrcoef(center_X)[0, 1:].argmax()
    -    return ind_max
    -
    -
    -def get_cluster_medoids(X, labels=None):
    -    """Obtain cluster medoids and their labels"""
    -    unique_labels = list(OrderedDict.fromkeys(labels))
    -    list_masks = [[True if i == label else False for i in labels] for label in unique_labels]
    -    list_ind_max = [_cluster_medoid(X[mask]) for mask in list_masks]
    -    indices = np.array(range(0, len(labels)))
    -    medoid_ind = [indices[m][i] for m, i in zip(list_masks, list_ind_max)]
    -    medoid_labels = [labels[i] for i in medoid_ind]
    -    medoids = np.array([X[i, :] for i in medoid_ind])
    -    return medoids, medoid_labels, medoid_ind
    -
    -
    -# Compute minimum correlation on center or all scales
    -def _min_cor_center(X):
    -    """Get minimum for correlation of all columns with cluster center, defined as the mean values
    -    for each amino acid over all scales."""
    -    # Create new matrix including cluster center
    -    center_X = np.concatenate([cluster_center(X), X], axis=0)
    -    # Get minimum correlation with mean values
    -    min_cor = np.corrcoef(center_X)[0, ].min()
    -    return min_cor
    -
    -
    -def _min_cor_all(X):
    -    """Get minimum for pair-wise correlation of all columns in given matrix."""
    -    # Get minimum correlations minimum/ maximum distance for pair-wise comparisons
    -    min_cor = np.corrcoef(X).min()
    -    return min_cor
    -
    -
    -def get_min_cor(X, labels=None, on_center=True):
    -    """Compute minimum pair-wise correlation or correlation with cluster center for each cluster label
    -    and return minimum of obtained cluster minimums."""
    -    f = _min_cor_center if on_center else _min_cor_all
    -    if labels is None:
    -        return f(X)
    -    # Minimum correlations for each cluster (with center or all scales)
    -    unique_labels = list(OrderedDict.fromkeys(labels))
    -    list_masks = [[True if i == label else False for i in labels] for label in unique_labels]
    -    list_min_cor = [f(X[mask]) for mask in list_masks]
    -    # Minimum for all clusters
    -    min_cor = min(list_min_cor)
    -    return min_cor
    -
    -
    -# Get maximum distance on center or all scales
    -def get_max_dist(X, on_center=True, metric="euclidean"):
    -    """"""
    -    # Maximum distance for cluster
    -    if on_center:
    -        # Create new matrix including cluster center
    -        center_X = np.concatenate([cluster_center(X), X], axis=0)
    -        # Get maximum distance with mean values
    -        max_dist = pairwise_distances(center_X, metric=metric)[0, ].max()
    -    else:
    -        # Get maximum distance for pair-wise comparisons
    -        max_dist = pairwise_distances(X, metric=metric).max()
    -    return max_dist
    -
    -
    -# II Main Functions
    -# AAclust algorithm steps (estimate lower bound for n_clusters -> optimization of n_clusters -> merge clusters)
    -# 1. Step (Estimation of n clusters)
    -def estimate_lower_bound_n_clusters(X, model=None, model_kwargs=None, min_th=0.6, on_center=True):
    -    """
    -    Estimate the lower bound of the number of clusters (k).
    -
    -    This function estimates the lower bound of the number of clusters by testing a range
    -    between 10% and 90% of all observations, incrementing in 10% steps.
    -
    -    Parameters
    -    ----------
    -    X : array-like, shape (n_samples, n_features)
    -        Feature matrix where `n_samples` is the number of samples and `n_features` is the number of features.
    -    model : callable, optional
    -        k-based clustering model to use.
    -    model_kwargs : dict, optional
    -        Dictionary of keyword arguments to pass to the clustering model.
    -    min_th : float, optional, default = 0.6
    -        Minimum threshold of within-cluster Pearson correlation required for a valid clustering.
    -    on_center : bool, optional, default = True
    -        Whether the minimum correlation is computed for all observations within a cluster
    -        or just for the cluster center.
    -
    -    Returns
    -    -------
    -    n_clusters : int
    -        Estimated lower bound for the number of clusters (k).
    -    """
    -    f = lambda c: get_min_cor(X, labels=model(n_clusters=c, **model_kwargs).fit(X).labels_, on_center=on_center)
    -    # Create range between 10% and 90% of all scales (10% steps) as long as minimum correlation is lower than threshold
    -    n_samples, n_features = X.shape
    -    nclust_mincor = [(1, f(1))]
    -    step_number = 40
    -    for i in range(1, step_number, 1):
    -        n_clusters = max(1, int(n_samples*i/step_number))    # n cluster in 2.5% steps
    -        min_cor = f(n_clusters)
    -        if min_cor < min_th:   # Save only lower bounds
    -            nclust_mincor.append((n_clusters, min_cor))
    -        else:
    -            break
    -    # Select second highest lower bound (highest lower bound is faster but might surpass true bound)
    -    nclust_mincor.sort(key=lambda x: x[0], reverse=True)
    -    n_clusters = nclust_mincor[1][0] if len(nclust_mincor) > 1 else nclust_mincor[0][0]  # Otherwise, only existing one
    -    return n_clusters
    -
    -
    -# 2. Step (Optimization of n clusters)
    -def optimize_n_clusters(X, model=None, model_kwargs=None, n_clusters=None, min_th=0.5, on_center=True):
    -    """
    -    Optimize the number of clusters using a recursive algorithm.
    -
    -    This function performs clustering in a recursive manner (through a while loop) to ensure
    -    that the minimum within-cluster correlation is achieved for all clusters. It is an efficiency
    -    optimized version of a step-wise algorithm where the `n_clusters` is incrementally increased
    -    until a stop condition is met.
    -
    -    Parameters
    -    ----------
    -    X : array-like, shape (n_samples, n_features)
    -        Feature matrix where `n_samples` is the number of samples and `n_features` is the number of features.
    -    model : callable, optional
    -        k-based clustering model to use.
    -    model_kwargs : dict, optional
    -        Dictionary of keyword arguments to pass to the clustering model.
    -    n_clusters : int, optional
    -        Estimated number of clusters (k).
    -    min_th : float, optional, default = 0.5
    -        Minimum threshold of within-cluster Pearson correlation required for a valid clustering.
    -    on_center : bool, optional, default = True
    -        Whether the minimum correlation is computed for all observations within a cluster
    -        or just for the cluster center.
    -
    -    Returns
    -    -------
    -    n_clusters : int
    -        Optimized number of clusters (k) after the recursive clustering.
    -    """
    -    n_samples, n_features = X.shape
    -    f = lambda c: get_min_cor(X, labels=model(n_clusters=c, **model_kwargs).fit(X).labels_, on_center=on_center)
    -    min_cor = f(n_clusters)
    -    # Recursive optimization of n_clusters via step wise increase starting from lower bound
    -    step = max(1, min(int(n_samples/10), 5))    # Step size between 1 and 5
    -    while min_cor < min_th and n_clusters < n_samples:    # Stop condition of clustering
    -        n_clusters = min(n_clusters+step, n_samples) # Maximum of of n_samples is allowed
    -        min_cor = f(n_clusters)
    -        # Exceeding of threshold -> Conservative adjustment of clustering parameters to meet true optimum
    -        if min_cor >= min_th and step != 1:
    -            n_clusters = max(1, n_clusters - step * 2)
    -            step = 1
    -            min_cor = f(n_clusters)
    -    return n_clusters
    -
    -
    -# 3. Step (Merging)
    -def _get_min_cor_cluster(X, labels=None, label_cluster=None, on_center=True):
    -    """Get min_cor for single cluster"""
    -    mask = [l == label_cluster for l in labels]
    -    min_cor = get_min_cor(X[mask], on_center=on_center)
    -    return min_cor
    -
    -
    -def _get_quality_measure(X, metric=None, labels=None, label_cluster=None, on_center=True):
    -    """Get quality measure single cluster given by feature matrix X, labels, and label of cluster"""
    -    mask = [l == label_cluster for l in labels]
    -    if metric == ut.METRIC_CORRELATION:
    -        return get_min_cor(X[mask], on_center=on_center)
    -    else:
    -        return get_max_dist(X[mask], on_center=on_center, metric=metric)
    -
    -
    -def _get_best_cluster(dict_clust_qm=None, metric=None):
    -    """Get cluster with best quality measure: either highest minimum Pearson correlation
    -    or lowest distance measure"""
    -    if metric == ut.METRIC_CORRELATION:
    -        return max(dict_clust_qm, key=dict_clust_qm.get)
    -    else:
    -        return min(dict_clust_qm, key=dict_clust_qm.get)
    -
    -
    -def merge_clusters(X, n_max=5, labels=None, min_th=0.5, on_center=True, metric="correlation"):
    -    """
    -    Merge small clusters into other clusters optimizing a given quality measure.
    -
    -    This function merges clusters with sizes less than or equal to `n_max` into other clusters
    -    based on a specified quality measure (Pearson correlation or a distance metric).
    -    Merging is conducted only if the new assignment meets a minimum within-cluster Pearson
    -    correlation threshold defined by `min_th`.
    -
    -    Parameters
    -    ----------
    -    X : array-like, shape (n_samples, n_features)
    -        Feature matrix where `n_samples` is the number of samples and `n_features` is the number of features.
    -    n_max : int, optional, default = 5
    -        Maximum cluster size for small clusters to be considered for merging.
    -    labels : array-like, shape (n_samples,), optional
    -        Initial cluster labels for observations.
    -    min_th : float, optional, default = 0.5
    -        Minimum threshold of within-cluster Pearson correlation required for merging.
    -    on_center : bool, optional, default = True
    -        Whether the minimum correlation is computed for all observations within a cluster
    -        or just for the cluster center.
    -    metric : str, optional, default = 'correlation'
    -        Quality measure used to optimize merging. Can be 'correlation' for maximum correlation
    -        or any valid distance metric like 'euclidean' for minimum distance.
    -
    -    Returns
    -    -------
    -    labels : array-like, shape (n_samples,)
    -        Cluster labels for observations after merging.
    -    """
    -    unique_labels = list(OrderedDict.fromkeys(labels))
    -    for n in range(1, n_max):
    -        s_clusters = [x for x in unique_labels if labels.count(x) == n]   # Smallest clusters
    -        b_clusters = [x for x in unique_labels if labels.count(x) > n]    # Bigger clusters (all others)
    -        # Assign scales from smaller clusters to cluster by optimizing for quality measure
    -        for s_clust in s_clusters:
    -            dict_clust_qm = {}  # Cluster to quality measure
    -            for b_clust in b_clusters:
    -                labels_ = [x if x != s_clust else b_clust for x in labels]
    -                args = dict(labels=labels_, label_cluster=b_clust, on_center=on_center)
    -                min_cor = _get_min_cor_cluster(X, **args)
    -                if min_cor >= min_th:
    -                    dict_clust_qm[b_clust] = _get_quality_measure(X, **args, metric=metric)
    -            if len(dict_clust_qm) > 0:
    -                b_clust_best = _get_best_cluster(dict_clust_qm=dict_clust_qm, metric=metric)
    -                labels = [x if x != s_clust else b_clust_best for x in labels]
    -    # Update labels (cluster labels are given in descending order of cluster size)
    -    sorted_labels = pd.Series(labels).value_counts().index  # sorted in descending order of size
    -    dict_update = {label: i for label, i in zip(sorted_labels, range(0, len(set(labels))))}
    -    labels = [dict_update[label] for label in labels]
    -    return labels
    -
    -
    -# AAclust naming
    -def get_names_cluster(list_names=None, name_medoid=None, name_unclassified="Unclassified"):
    -    """
    -    Get list of cluster names sorted based on following criteria (descending order):
    -        a) Frequency of term (most frequent term is preferred)
    -        b) Term is the name or a sub-name of the given medoid
    -        c) Length of term (shorter terms are preferred)
    -    If cluster consists of only one term, the name will be 'unclassified ('category name')'
    -    """
    -    def remove_2nd_info(name_):
    -        """Remove information given behind comma"""
    -        if "," in name_:
    -            name_ = name_.split(",")[0]
    -            if "(" in name_:
    -                name_ += ")"  # Close parenthesis if interpreted by deletion
    -        return name_
    -    # Filter categories (Remove unclassified scales and secondary infos)
    -    list_names = [remove_2nd_info(x) for x in list_names if "Unclassified" not in x]
    -    # Create list of shorter names not containing information given in parenthesis
    -    list_short_names = [x.split(" (")[0] for x in list_names if " (" in x]
    -    if len(list_names) > 1:
    -        list_names.extend(list_short_names)
    -        # Obtain information to check criteria for sorting scale names
    -        df_counts = pd.Series(list_names).value_counts().to_frame().reset_index()   # Compute frequencies of names
    -        df_counts.columns = ["name", "count"]
    -        df_counts["medoid"] = [True if x in name_medoid else False for x in df_counts["name"]]  # Name in medoid
    -        df_counts["length"] = [len(x) for x in df_counts["name"]]      # Length of name
    -        # Sort names based on given criteria
    -        df_counts = df_counts.sort_values(by=["count", "medoid", "length"], ascending=[False, False, True])
    -        names_cluster = df_counts["name"].tolist()
    -    else:
    -        names_cluster = [name_unclassified]
    -    return names_cluster
    -
    -
    -
    [docs]class AAclust: - """ - AAclust: A k-optimized clustering framework for selecting redundancy-reduced set of numerical scales. - - AAclust is designed primarily for amino acid scales but is versatile enough for any set of numerical indices. - It takes clustering models that require a pre-defined number of clusters (k) from - `scikit-learn <https://scikit-learn.org/stable/modules/clustering.html>`. By leveraging Pearson correlation as - similarity measure, AAclust optimizes the value of k. It then selects one representative sample (termed as 'medoid') - for each cluster, which is the closest to the cluster's center, yielding a redundancy-reduced sample set. - - Parameters - ---------- - model : callable, optional, default = :class:`sklearn.cluster.KMeans` - The employed clustering model requiring pre-defined number of clusters 'k', given as 'n_clusters' parameter. - model_kwargs : dict, optional, default = {} - A dictionary of keyword arguments to pass to the selected clustering model. - - verbose : bool, optional, default = False - A flag to enable or disable verbose outputs. - - Attributes - ---------- - n_clusters : int, default = None - Number of clusters obtained by AAclust. - labels_ : array-like, default = None - Cluster labels in the order of samples in the feature matrix. - centers_ : array-like, default = None - Average scale values corresponding to each cluster. - center_labels_ : array-like, default = None - Cluster labels for each cluster center. - medoids_ : array-like, default = None - Representative samples (one for each cluster center). - medoid_labels_ : array-like, default = None - Cluster labels for each medoid. - medoid_ind_ : array-like, default = None - Indices of the chosen medoids within the original dataset. - """ -
    [docs] def __init__(self, model=None, model_kwargs=None, verbose=False): - # Model parameters - if model is None: - model = KMeans - self.model = model - if model_kwargs is None: - model_kwargs = dict() - model_kwargs = ut.check_model(model=self.model, model_kwargs=model_kwargs) - self._model_kwargs = model_kwargs - # AAclust clustering settings - self._verbose = verbose - # Output parameters (will be set during model fitting) - self.n_clusters = None # Number of by AAclust obtained clusters - self.labels_ = None # Cluster labels in order of samples in feature matrix - self.centers_ = None # Mean scales for each cluster - self.center_labels_ = None - self.medoids_ = None - self.medoid_labels_ = None - self.medoid_ind_ = None
    - - # Clustering method -
    [docs] def fit(self, X, names=None, on_center=True, min_th=0, merge_metric="euclidean", n_clusters=None): - """ - Fit the AAclust model on the data, optimizing cluster formation using Pearson correlation. - - AAclust determines the optimal number of clusters, k, without pre-specification. It partitions data(X) into - clusters by maximizing the within-cluster Pearson correlation beyond the 'min_th' threshold. The quality of - clustering is either based on the minimum Pearson correlation of all members ('min_cor all') or between - the cluster center and its members ('min_cor center'), governed by `on_center`. - - The clustering undergoes three stages: - 1. Estimate the lower bound of k. - 2. Refine k using the chosen quality metric. - 3. Optionally merge smaller clusters, as directed by `merge_metric`. - - Finally, a representative scale (medoid) 'closest to each cluster center is chosen for redundancy reduction. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Feature matrix where `n_samples` is the number of samples and `n_features` is the number of features. - names : list of str, optional - Sample names. If provided, returns names of the medoids. - on_center : bool, default = True - If True, the correlation threshold is applied to the cluster center. Otherwise, it's applied to all cluster members. - min_th : float, default = 0 - Pearson correlation threshold for clustering (between 0 and 1). - merge_metric : str or None, default = "euclidean" - Metric used for optional cluster merging. Can be "euclidean", "pearson", or None (no merging). - n_clusters : int, optional - Pre-defined number of clusters. If provided, AAclust uses this instead of optimizing k. - - Returns - ------- - names_medoid : list of str, if `names` is provided - Names of the medoids. 
- - Notes - ----- - The 'fit' method sets the following attributes: :attr: `aaanalysis.AAclust.n_clusters", - :attr: `aaanalysis.AAclust.labels_`, :attr: `aaanalysis.AAclust.centers_`, - :attr: `aaanalysis.AAclust.center_labels_`, :attr: `aaanalysis.AAclust.medoids_`. - :attr: `aaanalysis.AAclust.medoid_labels_`, :attr: `aaanalysis.AAclust.medoid_ind_`. - - For further information, refer to the AAclust paper : TODO: add link to AAclust paper - """ - # Check input - ut.check_min_th(min_th=min_th) - merge_metric = ut.check_merge_metric(merge_metric=merge_metric) - X, names = ut.check_feat_matrix(X=X, names=names) - args = dict(model=self.model, model_kwargs=self._model_kwargs, min_th=min_th, on_center=on_center) - # Clustering using given clustering models - if n_clusters is not None: - labels = self.model(n_clusters=n_clusters, **self._model_kwargs).fit(X).labels_.tolist() - # Clustering using AAclust algorithm - else: - # Estimation of lower bound of number of clusters via testing range between 10% and 90% of all scales - if self._verbose: - print("1. Estimation of lower bound of k (number of clusters)", end="") - n_clusters_lb = estimate_lower_bound_n_clusters(X, **args) - if self._verbose: - print(f": k={n_clusters_lb}") - # Optimization of number of clusters by recursive clustering - if self._verbose: - objective_fct = "min_cor_center" if on_center else "min_cor_all" - print(f"2. Optimization of k by recursive clustering ({objective_fct}, min_th={min_th})", end="") - n_clusters = optimize_n_clusters(X, n_clusters=n_clusters_lb, **args) - if self._verbose: - print(f": k={n_clusters}") - labels = self.model(n_clusters=n_clusters, **self._model_kwargs).fit(X).labels_.tolist() - # Cluster merging: assign scales from small clusters to other cluster with highest minimum correlation - if merge_metric is not None: - if self._verbose: - print("3. 
Cluster merging (optional)", end="") - labels = merge_clusters(X, labels=labels, min_th=min_th, on_center=on_center, metric=merge_metric) - if self._verbose: - print(f": k={len(set(labels))}") - # Obtain cluster centers and medoids - medoids, medoid_labels, medoid_ind = get_cluster_medoids(X, labels=labels) - centers, center_labels = get_cluster_centers(X, labels=labels) - # Save results - self.n_clusters = len(set(labels)) - self.labels_ = np.array(labels) - self.centers_ = centers - self.center_labels_ = center_labels - self.medoids_ = medoids # Representative scales - self.medoid_labels_ = medoid_labels - self.medoid_ind_ = medoid_ind # Index of medoids - # Return labels of medoid if y is given - if names is not None: - names_medoid = [names[i] for i in medoid_ind] - return names_medoid
    - -
    [docs] def cluster_naming(self, names=None, labels=None, name_unclassified="Unclassified"): - """ - Assigns names to clusters based on scale names and their frequency. - - This method renames clusters based on the names of the scales in each cluster, with priority given to the - most frequent scales. If the name is already used or does not exist, it defaults to 'name_unclassified'. - - Parameters - ---------- - names : list, optional - List of scale names corresponding to each sample. - labels : list, optional - Cluster labels. If not provided, uses the labels from the fitted model. - name_unclassified : str, default = "Unclassified" - Name assigned to clusters that cannot be classified with the given names. - Returns - ------- - cluster_names : list - A list of renamed clusters based on scale names. - """ - if type(names) is not list: - raise ValueError("'names' must be list") - if labels is None: - labels = self.labels_ - dict_medoids = dict(zip(self.medoid_labels_, self.medoid_ind_)) - # Get cluster labels sorted in descending order of frequency - labels_sorted = pd.Series(labels).value_counts().index - # Assign names to cluster - dict_cluster_names = {} - for clust in labels_sorted: - name_medoid = names[dict_medoids[clust]] - list_names = [names[i] for i in range(0, len(names)) if labels[i] == clust] - names_cluster = get_names_cluster(list_names=list_names, - name_medoid=name_medoid, - name_unclassified=name_unclassified) - assigned = False - for name in names_cluster: - if name not in dict_cluster_names.values() or name == name_unclassified: - dict_cluster_names[clust] = name - assigned = True - break - if not assigned: - dict_cluster_names[clust] = name_unclassified - cluster_names = [dict_cluster_names[label] for label in labels] - return cluster_names
    - -
    [docs] @staticmethod - def get_cluster_centers(X, labels=None): - """ - Computes the center of each cluster based on the given labels. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Feature matrix where `n_samples` is the number of samples and `n_features` is the number of features. - labels : list or array-like, optional - Cluster labels for each sample in X. - - Returns - ------- - centers : array-like - The computed center for each cluster. - center_labels : array-like - The labels associated with each computed center. - """ - centers, center_labels = get_cluster_centers(X, labels=labels) - return centers, center_labels
    - -
    [docs] @staticmethod - def get_cluster_medoids(X, labels=None): - """ - Computes the medoid of each cluster based on the given labels. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Feature matrix where `n_samples` is the number of samples and `n_features` is the number of features. - labels : list or array-like, optional - Cluster labels for each sample in X. - - Returns - ------- - medoids : array-like - The medoid for each cluster. - medoid_labels : array-like - The labels corresponding to each medoid. - medoid_ind : array-like - Indexes of medoids within the original data. - """ - medoids, medoid_labels, medoid_ind = get_cluster_medoids(X, labels=labels) - return medoids, medoid_labels, medoid_ind
    - -
    [docs] @staticmethod - def correlation(X_test, X_ref, labels_test=None, labels_ref=None, n=3, positive=True, - on_center=False, except_unclassified=True): - """ - Computes the correlation of test data with reference cluster centers. - - Parameters - ---------- - X_test : array-like - Test feature matrix. - X_ref : array-like - Reference feature matrix. - labels_test : list or array-like, optional - Cluster labels for the test data. - labels_ref : list or array-like, optional - Cluster labels for the reference data. - n : int, default = 3 - Number of top centers to consider based on correlation strength. - positive : bool, default = True - If True, considers positive correlations. Else, negative correlations. - on_center : bool, default = False - If True, correlation is computed with cluster centers. Otherwise, with all cluster members. - except_unclassified : bool, default = True - If True, excludes 'unclassified' clusters from the reference list. - - Returns - ------- - list_top_center_name_corr : list of str - Names and correlations of centers having strongest (positive/negative) correlation with test data samples. 
- """ - # Check input - X_test, labels_test = ut.check_feat_matrix(X=X_test, names=labels_test) - X_ref, labels_ref = ut.check_feat_matrix(X=X_ref, names=labels_ref) - if except_unclassified: - names_ref = list(dict.fromkeys(labels_ref)) - else: - names_ref = [x for x in list(dict.fromkeys(labels_ref)) if "unclassified" not in x.lower()] - masks_ref = [[True if i == label else False for i in labels_ref] for label in names_ref] - if on_center: - # Get centers for all clusters in reference data - centers = np.concatenate([cluster_center(X_ref[mask]) for mask in masks_ref], axis=0) - # Compute correlation of test data with centers - Xtest_centers = np.concatenate([X_test, centers], axis=0) - n_test = X_test.shape[0] - X_corr = np.corrcoef(Xtest_centers)[:n_test, n_test:] - else: - masks_test = [[True if i == j else False for j in range(0, len(labels_test))] - for i, _ in enumerate(labels_test)] - # Compute minimum correlation of test data with each group of reference data - X_corr = np.array([[_min_cor_all(np.concatenate([X_test[mask_test], X_ref[mask_ref]], axis=0)) - for mask_ref in masks_ref ] for mask_test in masks_test]) - # Get index for n centers with highest/lowest correlation for each scale - if positive: - list_top_center_ind = X_corr.argsort()[:, -n:][:, ::-1] - else: - list_top_center_ind = X_corr.argsort()[:, :n] - # Get name and correlation for centers correlating strongest (positive/negative) with test data samples - list_top_center_name_corr = [] - for i, ind in enumerate(list_top_center_ind): - top_corr = X_corr[i, :][ind] - top_names = [names_ref[x] for x in ind] - str_corr = ";".join([f"{name} ({round(corr, 3)})" for name, corr in zip(top_names, top_corr)]) - list_top_center_name_corr.append(str_corr) - return list_top_center_name_corr
    - -
    [docs] def eval(self): - """"""
    - # TODO add evaluation function -
    - -
    -
    - -
    -
    -
    -
    - - - - \ No newline at end of file diff --git a/docs/build/html/_modules/aaanalysis/cpp/cpp.html b/docs/build/html/_modules/aaanalysis/cpp/cpp.html deleted file mode 100644 index 2aaced32..00000000 --- a/docs/build/html/_modules/aaanalysis/cpp/cpp.html +++ /dev/null @@ -1,443 +0,0 @@ - - - - - - aaanalysis.cpp.cpp — AAanalysis - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - -
    - -
    -
    -
    - -
    -
    -
    -
    - -

    Source code for aaanalysis.cpp.cpp

    -"""
    -This is a script for ...
    -"""
    -import pandas as pd
    -
    -from aaanalysis.cpp.feature import SequenceFeature
    -from aaanalysis.cpp._feature_stat import SequenceFeatureStatistics
    -
    -import aaanalysis as aa
    -import aaanalysis.utils as ut
    -from aaanalysis.template_classes import Tool
    -
    -# I Helper Functions
    -
    -
    -# Filtering functions
    -def _filtering_info(df=None, df_scales=None, check_cat=True):
    -    """Get datasets structures for filtering, two dictionaries with feature to scales category resp.
    -    feature positions and one datasets frame with paired pearson correlations of all scales"""
    -    if check_cat:
    -        dict_c = dict(zip(df[ut.COL_FEATURE], df["category"]))
    -    else:
    -        dict_c = dict()
    -    dict_p = dict(zip(df[ut.COL_FEATURE], [set(x) for x in df["positions"]]))
    -    df_cor = df_scales.corr()
    -    return dict_c, dict_p, df_cor
    -
    -
    -# TODO simplify checks & interface (end-to-end check with tests & docu)
    -
    -# II Main Functions
    -
    [docs]class CPP(Tool): - """ - Create and filter features that are most discriminant between two sets of sequences. - - Parameters - ---------- - df_scales : :class:`pandas.DataFrame` - DataFrame with amino acid scales. - df_cat : :class:`pandas.DataFrame`, default = aa.load_categories - DataFrame with default categories for physicochemical amino acid scales. - df_parts : :class:`pandas.DataFrame` - DataFrame with sequence parts. - split_kws : dict, default = SequenceFeature.get_split_kws - Nested dictionary with parameter dictionary for each chosen split_type. - accept_gaps : bool, default = False - Whether to accept missing values by enabling omitting for computations (if True). - - verbose : bool, default = True - Whether to print progress information about the algorithm (if True). - - Notes - ----- - The CPP.run() method performs all steps of the CPP algorithm. - - """ -
    [docs] def __init__(self, df_scales=None, df_cat=None, df_parts=None, split_kws=None, - accept_gaps=False, verbose=True): - # Load default scales if not specified - sf = SequenceFeature() - if df_cat is None: - df_cat = aa.load_scales(name=ut.STR_SCALE_CAT) - if df_scales is None: - df_scales = aa.load_scales() - if split_kws is None: - split_kws = sf.get_split_kws() - ut.check_bool(name="verbose", val=verbose) - ut.check_df_parts(df_parts=df_parts, verbose=verbose) - df_parts = ut.check_df_scales(df_scales=df_scales, df_parts=df_parts, accept_gaps=accept_gaps) - df_cat, df_scales = ut.check_df_cat(df_cat=df_cat, df_scales=df_scales, verbose=verbose) - ut.check_split_kws(split_kws=split_kws) - self._verbose = verbose - self._accept_gaps = accept_gaps - # Feature components: Scales + Part + Split - self.df_cat = df_cat.copy() - self.df_scales = df_scales.copy() - self.df_parts = df_parts.copy() - self.split_kws = split_kws
    - - # Adder methods for CPP analysis (used in run method) - def _add_scale_info(self, df_feat=None): - """ - Add scale information to DataFrame. Scale information are–from general to specific–scale categories, - sub categories, and scale names. - - Parameters - ---------- - df_feat: :class:`pandas.DataFrame` - Feature DataFrame to add scale categories. - - Returns - ------- - df_feat: :class:`pandas.DataFrame` - Feature DataFrame including scale categories. - """ - # Check input - df_feat = ut.check_df_feat(df_feat=df_feat) - - # Add scale categories - df_cat = self.df_cat.copy() - i = df_feat.columns.get_loc(ut.COL_FEATURE) - for col in [ut.COL_SCALE_DES, ut.COL_SCALE_NAME, ut.COL_SUBCAT, ut.COL_CAT]: - if col in list(df_feat): - df_feat.drop(col, inplace=True, axis=1) - dict_cat = dict(zip(df_cat[ut.COL_SCALE_ID], df_cat[col])) - vals = [dict_cat[s.split("-")[2]] for s in df_feat[ut.COL_FEATURE]] - df_feat.insert(i + 1, col, vals) - return df_feat - - def _add_stat(self, df_feat=None, labels=None, parametric=False, accept_gaps=False): - """ - Add summary statistics for each feature to DataFrame. - - Parameters - ---------- - df_feat: :class:`pandas.DataFrame` - Feature DataFrame to add statistics. - labels: array-like, shape (n_samples) - Class labels for samples in df_parts attribute. - parametric: bool, default = False - Whether to use parametric (T-test) or non-parametric (U-test) test for p-value computation. - accept_gaps: bool, default = False - Whether to accept missing values by enabling omitting for computations (if True). - - Returns - ------- - df_feat: :class:`pandas.DataFrame` - Feature DataFrame including statistics for comparing two given groups. - - Notes - ----- - P-values are calculated Mann-Whitney U test (non-parametric) or T-test (parametric) as implemented in SciPy. - - For multiple hypothesis correction, the Benjamini-Hochberg FDR correction is applied on all given features - as implemented in SciPy. 
- """ - # Check input - df_feat = ut.check_df_feat(df_feat=df_feat) - ut.check_labels(labels=labels, df=self.df_parts, name_df="df_parts") - ut.check_bool(name="parametric", val=parametric) - - # Add feature statistics - features = list(df_feat[ut.COL_FEATURE]) - sf = SequenceFeature() - sfs = SequenceFeatureStatistics() - X = sf.feat_matrix(df_parts=self.df_parts, - features=features, - df_scales=self.df_scales, - accept_gaps=accept_gaps) - df_feat = sfs.add_stat(df=df_feat, X=X, y=labels, parametric=parametric) - return df_feat - - @staticmethod - def _add_positions(df_feat=None, tmd_len=20, jmd_n_len=10, jmd_c_len=10, ext_len=4, start=1): - """Add sequence positions to DataFrame.""" - # Check input (length checked by SequenceFeaturePositions) - df_feat = ut.check_df_feat(df_feat=df_feat) - # Add positions of features - features = df_feat[ut.COL_FEATURE].to_list() - sf = SequenceFeature() - feat_positions = sf.add_position(features=features, tmd_len=tmd_len, start=start, - jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, ext_len=ext_len) - df_feat[ut.COL_POSITION] = feat_positions - return df_feat - - # Filtering methods - @staticmethod - def _pre_filtering(features=None, abs_mean_dif=None, std_test=None, max_std_test=0.2, n=10000): - """CPP pre-filtering based on thresholds.""" - df = pd.DataFrame(zip(features, abs_mean_dif, std_test), - columns=[ut.COL_FEATURE, ut.COL_ABS_MEAN_DIF, ut.COL_STD_TEST]) - df = df[df[ut.COL_STD_TEST] <= max_std_test] - df = df.sort_values(by=ut.COL_ABS_MEAN_DIF, ascending=False).head(n) - return df - - def _filtering(self, df=None, max_overlap=0.5, max_cor=0.5, n_filter=100, check_cat=True): - """CPP filtering algorithm based on redundancy reduction in descending order of absolute AUC.""" - dict_c, dict_p, df_cor = _filtering_info(df=df, df_scales=self.df_scales, check_cat=check_cat) - df = df.sort_values(by=[ut.COL_ABS_AUC, ut.COL_ABS_MEAN_DIF], ascending=False).copy().reset_index(drop=True) - list_feat = list(df[ut.COL_FEATURE]) - 
list_top_feat = [list_feat.pop(0)] # List with best feature - for feat in list_feat: - add_flag = True - # Stop condition for limit - if len(list_top_feat) == n_filter: - break - # Compare features with all top features (added if low overlap & weak correlation or different category) - for top_feat in list_top_feat: - if not check_cat or dict_c[feat] == dict_c[top_feat]: - # Remove if feat positions high overlap or subset - pos, top_pos = dict_p[feat], dict_p[top_feat] - overlap = len(top_pos.intersection(pos))/len(top_pos.union(pos)) - if overlap >= max_overlap or pos.issubset(top_pos): - # Remove if high pearson correlation - scale, top_scale = feat.split("-")[2], top_feat.split("-")[2] - cor = df_cor[top_scale][scale] - if cor > max_cor: - add_flag = False - if add_flag: - list_top_feat.append(feat) - df_top_feat = df[df[ut.COL_FEATURE].isin(list_top_feat)] - return df_top_feat - - # Main method -
    [docs] def run(self, labels=None, parametric=False, n_filter=100, - tmd_len=20, jmd_n_len=10, jmd_c_len=10, ext_len=4, start=1, - check_cat=True, n_pre_filter=None, pct_pre_filter=5, max_std_test=0.2, max_overlap=0.5, max_cor=0.5, - n_processes=None): - """ - Perform CPP pipeline by creation and two-step filtering of features. CPP aims to - identify a collection of non-redundant features that are most discriminant between - a test and a reference group of sequences. - - Parameters - ---------- - labels : array-like, shape (n_samples) - Class labels for samples in sequence DataFrame (test=1, reference=0). - parametric : bool, default = False - Whether to use parametric (T-test) or non-parametric (U-test) test for p-value computation. - n_filter : int, default = 100 - Number of features to be filtered/selected by CPP algorithm. - n_pre_filter : int, optional - Number of feature to be pre-filtered by CPP algorithm. If None, a percentage of all features is used. - tmd_len : int, >0 - Length of TMD used for positions. TODO add link to explanation - start : int, >=0 - Position label of first amino acid position (starting at N-terminus). - jmd_n_len : int, >=0, default = 10 - Length of JMD-N. - jmd_c_len : int, >=0, default = 10 - Length of JMD-C. - ext_len : int, >=0, default = 4 - Length of TMD-extending part (starting from C and N terminal part of TMD). - Should be longer than jmd_n_len and jmd_c_len - check_cat : bool, default = True - Whether to check for redundancy within scale categories. - pct_pre_filter : int, default = 5 - Percentage of all features that should remain after the pre-filtering step. - max_std_test : float [0-1], default = 0.2 - Maximum standard deviation within the test group used as threshold for pre-filtering. - max_overlap : float [0-1], default = 0.5 - Maximum positional overlap of features used as threshold for filtering. - max_cor : float [0-1], default = 0.5 - Maximum Pearson correlation of features used as threshold for filtering. 
- n_processes : int, default = None - Number of CPUs used for multiprocessing. If None, number will be optimized automatically. - - Returns - ------- - df_feat : :class:`pandas.DataFrame`, shape (n_feature, n_feature_information) - DataFrame with a unique identifier, scale information, statistics, and positions for each feature. - - Notes - ----- - The feature DataFrame contains the following eleven columns, including the unique - feature id (1), scale information (2-4), statistical results used for filtering and - ranking (5-10), and feature positions (11): - - 1. features: Feature ID (PART-SPLIT-SCALE) - 2. category: Scale category - 3. subcategory: Sub category of scales - 4. scale_name: Name of scales - 5. abs_auc: Absolute adjusted AUC [-0.5 to 0.5] - 6. abs_mean_dif: Absolute mean differences between test and reference group [0 to 1] - 7. std_test: Standard deviation in test group - 8. std_ref: Standard deviation in reference group - 9. p_val: Non-parametric (mann_whitney) or parametric (ttest_indep) statistic - 10. p_val_fdr_bh: Benjamini-Hochberg FDR corrected p-values - 11. 
positions: Feature positions for default settings - """ - # Check input - ut.check_labels(labels=labels, df=self.df_parts, name_df="df_parts") - ut.check_args_len(tmd_len=tmd_len, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, ext_len=ext_len) - ut.check_non_negative_number(name="n_filter", val=n_filter, min_val=1) - ut.check_non_negative_number(name="n_pre_filter", val=n_pre_filter, min_val=1, accept_none=True) - ut.check_non_negative_number(name="pct_pre_filter", val=pct_pre_filter, min_val=5, max_val=100) - ut.check_non_negative_number(name="max_std_test", val=max_std_test, min_val=0.0, max_val=1.0, just_int=False) - ut.check_non_negative_number(name="max_overlap", val=max_overlap, min_val=0.0, max_val=1.0, just_int=False) - ut.check_bool(name="verbose", val=self._verbose) - # Settings and creation of objects - args = dict(split_kws=self.split_kws, df_scales=self.df_scales) - if self._verbose: - sf = SequenceFeature() - n_feat = len(sf.get_features(**args, list_parts=list(self.df_parts))) - print(f"1. CPP creates {n_feat} features for {len(self.df_parts)} samples") - ut.print_start_progress() - # Pre-filtering: Select best n % of feature (filter_pct) based std(test set) and mean_dif - sfs = SequenceFeatureStatistics() - abs_mean_dif, std_test, features = sfs.pre_filtering_info(**args, - df_parts=self.df_parts, - y=labels, - accept_gaps=self._accept_gaps, - verbose=self._verbose, - n_processes=n_processes) - if n_pre_filter is None: - n_pre_filter = int(len(features) * (pct_pre_filter / 100)) - if self._verbose: - ut.print_finished_progress() - print(f"2. 
CPP pre-filters {n_pre_filter} features ({pct_pre_filter}%) with highest '{ut.COL_ABS_MEAN_DIF}'" - f" and 'max_std_test' <= {max_std_test}") - df = self._pre_filtering(features=features, - abs_mean_dif=abs_mean_dif, - std_test=std_test, - n=n_pre_filter, - max_std_test=max_std_test) - # Filtering using CPP algorithm - df = self._add_stat(df_feat=df, labels=labels, parametric=parametric, accept_gaps=self._accept_gaps) - if self._verbose: - print(f"3. CPP filtering algorithm") - df = self._add_positions(df_feat=df, tmd_len=tmd_len, start=start) - df = self._add_scale_info(df_feat=df) - df_feat = self._filtering(df=df, n_filter=n_filter, check_cat=check_cat, max_overlap=max_overlap, max_cor=max_cor) - df_feat.reset_index(drop=True, inplace=True) - if self._verbose: - print(f"4. CPP returns df with {len(df_feat)} unique features including general information and statistics") - return df_feat
    - -
    [docs] @staticmethod - def eval(df_feat=None, features=None): - """Get evaluation for provided dataset"""
    - # TODO get evaluation for any dataset for compelete -
    - -
    -
    - -
    -
    -
    -
    - - - - \ No newline at end of file diff --git a/docs/build/html/_modules/aaanalysis/cpp/cpp_plot.html b/docs/build/html/_modules/aaanalysis/cpp/cpp_plot.html deleted file mode 100644 index 5278f41c..00000000 --- a/docs/build/html/_modules/aaanalysis/cpp/cpp_plot.html +++ /dev/null @@ -1,777 +0,0 @@ - - - - - - aaanalysis.cpp.cpp_plot — AAanalysis - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - -
    - -
    -
    -
    - -
    -
    -
    -
    - -

    Source code for aaanalysis.cpp.cpp_plot

    -"""
    -This is a script for ...
    -"""
    -import pandas as pd
    -import matplotlib.pyplot as plt
    -import seaborn as sns
    -import inspect
    -
    -import aaanalysis
    -from aaanalysis.cpp._cpp import CPPPlots, get_optimal_fontsize
    -
    -import aaanalysis.utils as ut
    -
    -# Settings
    -pd.set_option('expand_frame_repr', False)  # Single line print for pd.Dataframe
    -
    -# TODO simplify checks & interface (end-to-end check with tests & docu)
    -# TODO plot_functions test & refactor (end-to-end)
    -
    -
    -# I Helper Functions
    -def check_value_type(val_type=None, count_in=True):
    -    """Check if value type is valid"""
    -    list_value_type = ["mean", "sum", "std"]
    -    if count_in:
    -        list_value_type.append("count")
    -    if val_type not in list_value_type:
    -        raise ValueError(f"'val_type' ('{val_type}') should be on of following: {list_value_type}")
    -
    -
    -def check_normalize(normalize=True):
    -    """Check normalize parameter"""
    -    if not (type(normalize) == bool or normalize in ["positions", "positions_only"]):
    -        raise ValueError(f"'normalize' ('{normalize}') should be bool or, if normalized for positions, 'positions'.")
    -    normalize_for_positions = False if type(normalize) is bool else "positions" in normalize
    -    normalize = normalize if type(normalize) is bool else "positions" == normalize
    -    return normalize, normalize_for_positions
    -
    -
    -# Check for plotting methods
    -def check_args_size(seq_size=None, tmd_jmd_fontsize=None):
    -    """Check if sequence size parameters match"""
    -    ut.check_non_negative_number(name="seq_size", val=seq_size, min_val=0, accept_none=True, just_int=False)
    -    ut.check_non_negative_number(name="tmd_jmd_fontsize", val=tmd_jmd_fontsize, min_val=0, accept_none=True, just_int=False)
    -    args_size = dict(seq_size=seq_size, tmd_jmd_fontsize=tmd_jmd_fontsize)
    -    return args_size
    -
    -
    -def check_args_xtick(xtick_size=None, xtick_width=None, xtick_length=None):
    -    """Check if x tick parameters non-negative float"""
    -    args = dict(accept_none=True, just_int=False, min_val=0)
    -    ut.check_non_negative_number(name="xtick_size", val=xtick_size, **args)
    -    ut.check_non_negative_number(name="xtick_width", val=xtick_width, **args)
    -    ut.check_non_negative_number(name="xtick_length", val=xtick_length, **args)
    -    args_xtick = dict(xtick_size=xtick_size, xtick_width=xtick_width, xtick_length=xtick_length)
    -    return args_xtick
    -
    -
    -def check_args_ytick(ytick_size=None, ytick_width=None, ytick_length=None):
    -    """Check if y tick parameters non-negative float"""
    -    args = dict(accept_none=True, just_int=False, min_val=1)
    -    ut.check_non_negative_number(name="ytick_size", val=ytick_size, **args)
    -    ut.check_non_negative_number(name="ytick_width", val=ytick_width, **args)
    -    ut.check_non_negative_number(name="ytick_length", val=ytick_length, **args)
    -    args_ytick = dict(ytick_size=ytick_size, ytick_width=ytick_width, ytick_length=ytick_length)
    -    return args_ytick
    -
    -
    -def check_part_color(tmd_color=None, jmd_color=None):
    -    """Check if part colors valid"""
    -    ut.check_color(name="tmd_color", val=tmd_color)
    -    ut.check_color(name="jmd_color", val=jmd_color)
    -    args_part_color = dict(tmd_color=tmd_color, jmd_color=jmd_color)
    -    return args_part_color
    -
    -
    -def check_seq_color(tmd_seq_color=None, jmd_seq_color=None):
    -    """Check sequence colors"""
    -    ut.check_color(name="tmd_seq_color", val=tmd_seq_color)
    -    ut.check_color(name="jmd_seq_color", val=jmd_seq_color)
    -    args_seq_color = dict(tmd_seq_color=tmd_seq_color, jmd_seq_color=jmd_seq_color)
    -    return args_seq_color
    -
    -
    -def check_figsize(figsize=None):
    -    """"""
    -    ut.check_tuple(name="figsize", val=figsize, n=2)
    -    ut.check_non_negative_number(name="figsize:width", val=figsize[0], min_val=1, just_int=False)
    -    ut.check_non_negative_number(name="figsize:height", val=figsize[1], min_val=1, just_int=False)
    -
    -
    -def check_dict_color(dict_color=None, df_cat=None):
    -    """Check if color dictionary is matching to DataFrame with categories"""
    -    list_cats = list(sorted(set(df_cat[ut.COL_CAT])))
    -    if dict_color is None:
    -        dict_color = ut.DICT_COLOR
    -    if not isinstance(dict_color, dict):
    -        raise ValueError(f"'dict_color' should be a dictionary with colors for: {list_cats}")
    -    list_cat_not_in_dict_cat = [x for x in list_cats if x not in dict_color]
    -    if len(list_cat_not_in_dict_cat) > 0:
    -        error = f"'dict_color' not complete! Following categories are missing from 'df_cat': {list_cat_not_in_dict_cat}"
    -        raise ValueError(error)
    -    for key in dict_color:
    -        color = dict_color[key]
    -        ut.check_color(name=key, val=color)
    -    return dict_color
    -
    -
    -def check_parameters(func=None, name_called_func=None, e=None):
    -    """Check parameters string from error message of third party packages"""
    -    list_arg_str = ["property ", "attribute ", "argument ", "parameter "]
    -    str_error = ""
    -    for arg_str in list_arg_str:
    -        if arg_str in str(e):
    -            error_arg = str(e).split(arg_str)[1]
    -            str_error += "Error due to {} parameter. ".format(error_arg)
    -            break
    -    args = [x for x in inspect.getfullargspec(func).args if x != "self"]
    -    str_error += "Arguments are allowed from {} and as follows: {}".format(name_called_func, args)
    -    return str_error
    -
    -
    -# Check heatmap plotting
    -def check_vmin_vmax(vmin=None, vmax=None):
    -    """Check if number of cmap colors is valid with given value range"""
    -    ut.check_float(name="vmin", val=vmin, accept_none=True, just_float=False)
    -    ut.check_float(name="vmax", val=vmax, accept_none=True, just_float=False)
    -    if vmin is not None and vmax is not None and vmin >= vmax:
    -        raise ValueError(f"'vmin' ({vmin}) < 'vmax' ({vmax}) not fulfilled.")
    -
    -
    -# Check barplot and profile
    -def check_grid_axis(grid_axis=None):
    -    """"""
    -    list_valid = ["x", 'y', 'both']
    -    if grid_axis not in list_valid:
    -        raise ValueError(f"'grid_axis' ('{grid_axis}') not valid. Chose from following: {list_valid}")
    -
    -
    -# Check stat plot
    -def check_ylabel_fontweight(ylabel_fontweight=None, accept_none=True):
    -    """"""
    -    if accept_none and ylabel_fontweight is None:
    -        return
    -    name = "ylabel_fontweight"
    -    args = dict(name=name, val=ylabel_fontweight)
    -    list_weights = ['light', 'medium', 'bold']
    -    if type(ylabel_fontweight) in [float, int]:
    -        ut.check_non_negative_number(**args, min_val=0, max_val=1000, just_int=False)
    -    elif isinstance(ylabel_fontweight, str):
    -        if ylabel_fontweight not in list_weights:
    -            error = f"'{name}' ({ylabel_fontweight}) should be one of following: {list_weights}"
    -            raise ValueError(error)
    -    else:
    -        error = f"'{name}' ({ylabel_fontweight}) should be either numeric value in range 0-1000" \
    -                f"\n\tor one of following: {list_weights}"
    -        raise ValueError(error)
    -
    -
    -# Plotting functions
    -def _get_df_pos(df_feat=None, df_cat=None, y="subcategory", val_col="mean_dif",
    -                value_type="mean", normalize=False,
    -                tmd_len=20, jmd_n_len=10, jmd_c_len=10, start=1):
    -    """Helper method for plotting"""
    -    normalize, normalize_for_pos = check_normalize(normalize=normalize)
    -    cpp_plot = CPPPlots(tmd_len=tmd_len, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, start=start)
    -    df_pos = cpp_plot.get_df_pos(df=df_feat.copy(), y=y, value_type=value_type, val_col=val_col,
    -                                 normalize=normalize,
    -                                 normalize_for_pos=normalize_for_pos)
    -    # Sort according to given categories
    -    list_cat = list(df_cat[y].drop_duplicates())
    -    list_col = list(df_pos.T)
    -    sorted_col = [x for x in list_cat if x in list_col]
    -    df_pos = df_pos.T[sorted_col].T
    -    return df_pos
    -
    -
    -def _add_importance_map(ax=None, df_feat=None, df_cat=None, start=None, args_len=None, y=None):
    -    """"""
    -    _df_pos = _get_df_pos(df_feat=df_feat, df_cat=df_cat, y=y, val_col=ut.COL_FEAT_IMPORTANCE,
    -                          value_type="sum", normalize="positions_only", start=start, **args_len)
    -    _df = pd.melt(_df_pos.reset_index(), id_vars="index")
    -    _df.columns = [ut.COL_SUBCAT, "position", ut.COL_FEAT_IMPORTANCE]
    -    _list_sub_cat = _df[ut.COL_SUBCAT].unique()
    -    for i, sub_cat in enumerate(_list_sub_cat):
    -        _dff = _df[_df[ut.COL_SUBCAT] == sub_cat]
    -        for pos, val in enumerate(_dff[ut.COL_FEAT_IMPORTANCE]):
    -            _symbol = "■"  # "•"
    -            color = "black"
    -            size = 12 if val >= 1 else (8 if val >= 0.5 else 4)
    -            _args_symbol = dict(ha="center", va="center", color=color, size=size)
    -            if val >= 0.2:
    -                ax.text(pos + 0.5, i + 0.5, _symbol, **_args_symbol)
    -
    -
    -def _set_size_to_optimized_value(seq_size=None, tmd_jmd_fontsize=None, opt_size=None):
    -    """Set sizes to given value if None"""
    -    if tmd_jmd_fontsize is None:
    -        tmd_jmd_fontsize = opt_size
    -    args_size = dict(seq_size=seq_size, tmd_jmd_fontsize=tmd_jmd_fontsize)
    -    return args_size
    -
    -
    -# TODO simplify interface (delete old profile)
    -# TODO add importance plot for heatmap
    -# TODO add ranking
    -
    -# II Main Functions
    -
    [docs]class CPPPlot: - """ - Create and filter features that are most discriminant between two sets of sequences. - - Parameters - ---------- - accept_gaps : bool, default = False - Whether to accept missing values by enabling omitting for computations (if True). - jmd_n_len : int, >=0, default = 10 - Length of JMD-N. - jmd_c_len : int, >=0, default = 10 - Length of JMD-C. - ext_len : int, >=0, default = 4 - Length of TMD-extending part (starting from C and N terminal part of TMD). - Conditions: ext_len < jmd_m_len and ext_len < jmd_c_len. - verbose : bool, default = True - Whether to print progress information about the algorithm (if True). - - """ -
    [docs] def __init__(self, df_cat=None, accept_gaps=False, jmd_n_len=10, jmd_c_len=10, ext_len=4, verbose=True): - # Load default scales if not specified - - ut.check_bool(name="verbose", val=verbose) - if df_cat is None: - df_cat = aaanalysis.load_scales(name=ut.COL_SCALE_ID) - self.df_cat = df_cat - self._verbose = verbose - self._accept_gaps = accept_gaps - # Set consistent length of JMD_N, JMD_C, TMD flanking amino acids (TMD-E) - self.jmd_n_len = jmd_n_len - self.jmd_c_len = jmd_c_len - self.ext_len = ext_len - # Axes dict for plotting - self.ax_seq = None
    - - # Plotting methods -
    [docs] def profile(self, df_feat=None, y="category", val_col="mean_dif", val_type="count", normalize=False, - figsize=(7, 5), title=None, title_kws=None, - dict_color=None, edge_color="none", bar_width=0.75, - add_jmd_tmd=True, tmd_len=20, start=1, - jmd_n_seq=None, tmd_seq=None, jmd_c_seq=None, - tmd_color="mediumspringgreen", jmd_color="blue", tmd_seq_color="black", jmd_seq_color="white", - seq_size=None, tmd_jmd_fontsize=None, - xtick_size=11.0, xtick_width=2.0, xtick_length=5.0, xticks_pos=False, - ytick_size=None, ytick_width=2.0, ytick_length=5.0, ylim=None, - highlight_tmd_area=True, highlight_alpha=0.15, - grid=False, grid_axis="both", - add_legend_cat=True, legend_kws=None, - shap_plot=False, - **kwargs): - """ - Plot feature profile for given features from 'df_feat'. - - Parameters - ---------- - df_feat : class:`pandas.DataFrame`, optional, default=None - Dataframe containing the features to be plotted. If None, default features from the instance will be used. - y : str, default='category' - Column name in df_feat which contains the categories for grouping. - val_col : str, default='mean_dif' - Column name in df_feat which contains the values to be plotted. - val_type : str, default='count' - Type of value. Available options are specified by the `check_value_type` function. - normalize : bool, default=False - If True, the feature values will be normalized. - figsize : tuple, default=(7, 5) - Size of the plot. - title : str, optional - Title of the plot. - title_kws : dict, optional - Keyword arguments to customize the title appearance. - dict_color : dict, optional - Dictionary mapping categories to colors. - edge_color : str, default='none' - Color of the edges of the bars. - bar_width : float, default=0.75 - Width of the bars. - add_jmd_tmd : bool, default=True - If True, adds JMD and TMD lines/annotations to the plot. - tmd_len : int, default=20 - Length of the TMD. - start : int, default=1 - Start position. 
- jmd_n_seq : str, optional - JMD N-terminal sequence. - tmd_seq : str, optional - TMD sequence. - jmd_c_seq : str, optional - JMD C-terminal sequence. - tmd_color : str, default='mediumspringgreen' - Color for TMD. - jmd_color : str, default='blue' - Color for JMD. - tmd_seq_color : str, default='black' - Color for TMD sequence. - jmd_seq_color : str, default='white' - Color for JMD sequence. - seq_size : float, optional - Font size for sequence annotations. - tmd_jmd_fontsize : float, optional - Font size for TMD and JMD annotations. - xtick_size : float, default=11.0 - Size for x-tick labels. - xtick_width : float, default=2.0 - Width of the x-ticks. - xtick_length : float, default=5.0 - Length of the x-ticks. - xticks_pos : bool, default=False - If True, x-tick positions are adjusted based on given sequences. - ytick_size : float, optional - Size for y-tick labels. - ytick_width : float, default=2.0 - Width of the y-ticks. - ytick_length : float, default=5.0 - Length of the y-ticks. - ylim : tuple, optional - Y-axis limits. - highlight_tmd_area : bool, default=True - If True, highlights the TMD area on the plot. - highlight_alpha : float, default=0.15 - Alpha value for TMD area highlighting. - grid : bool, default=False - If True, a grid is added to the plot. - grid_axis : str, default='both' - Axis on which the grid is drawn. Options: 'both', 'x', 'y'. - add_legend_cat : bool, default=True - If True, a legend is added for categories. - legend_kws : dict, optional - Keyword arguments for the legend. - shap_plot : bool, default=False - If True, SHAP (SHapley Additive exPlanations) plot is generated. - **kwargs : dict - Other keyword arguments passed to internal functions or plotting libraries. - - Returns - ------- - ax : matplotlib.axes.Axes - The axes object containing the plot. 
- - """ - # Group arguments - args_seq = dict(jmd_n_seq=jmd_n_seq, tmd_seq=tmd_seq, jmd_c_seq=jmd_c_seq,) - args_size = check_args_size(seq_size=seq_size, tmd_jmd_fontsize=tmd_jmd_fontsize) - args_len = ut.check_args_len(tmd_len=tmd_len, jmd_n_len=self.jmd_n_len, jmd_c_len=self.jmd_c_len, **args_seq) - args_xtick = check_args_xtick(xtick_size=xtick_size, xtick_width=xtick_width, xtick_length=xtick_length) - args_part_color = check_part_color(tmd_color=tmd_color, jmd_color=jmd_color) - args_seq_color = check_seq_color(tmd_seq_color=tmd_seq_color, jmd_seq_color=jmd_seq_color) - - # Checking input - # Args checked by Matplotlib: title, legend_kws - # Args checked by internal plotting functions: ylim - ut.check_non_negative_number(name="bar_width", val=bar_width, min_val=0, just_int=False) - ut.check_non_negative_number(name="start", val=start, min_val=0) - ut.check_non_negative_number(name="tmd_area_alpha", val=highlight_alpha, min_val=0, max_val=1, just_int=False) - ut.check_bool(name="add_jmd_tmd", val=add_jmd_tmd) - ut.check_bool(name="highlight_tmd_area", val=highlight_tmd_area) - ut.check_bool(name="grid", val=grid) - ut.check_bool(name="shap_plot", val=shap_plot) - ut.check_bool(name="add_legend_cat", val=add_legend_cat) - ut.check_color(name="edge_color", val=edge_color, accept_none=True) - ut.check_dict(name="legend_kws", val=legend_kws, accept_none=True) - - ut.check_col_in_df(df=df_feat, name_df="df_feat", col=val_col, col_type=[float, int]) - ut.check_y_categorical(df=df_feat, y=y) - df_feat = ut.check_df_feat(df_feat=df_feat) - check_value_type(val_type=val_type, count_in=True) - check_args_ytick(ytick_size=ytick_size, ytick_width=ytick_width, ytick_length=ytick_length) - check_figsize(figsize=figsize) - dict_color = check_dict_color(dict_color=dict_color, df_cat=self.df_cat) - check_grid_axis(grid_axis=grid_axis) - # Get df positions - df_feat = self.add_positions(df_feat=df_feat, tmd_len=args_len["tmd_len"], start=start) - df_pos = 
_get_df_pos(df_feat=df_feat, df_cat=self.df_cat, y=y, val_col=val_col, - value_type=val_type, normalize=normalize, start=start, **args_len) - # Plotting - cpp_plot = CPPPlots(**args_len, start=start) - try: - ax = cpp_plot.profile(df_pos=df_pos, figsize=figsize, ylim=ylim, - dict_color=dict_color, edge_color=edge_color, bar_width=bar_width, - add_legend=add_legend_cat, legend_kws=legend_kws, shap_plot=shap_plot, - **args_xtick, **kwargs) - except AttributeError as e: - error_message = check_parameters(func=self.profile, name_called_func="pd.DataFrame.plot", e=e) - raise AttributeError(error_message) - cpp_plot.set_title(title=title, title_kws=title_kws) - - # Autosize tmd sequence & annotation - opt_size = cpp_plot.optimize_label_size(ax=ax, df_pos=df_pos, label_term=False) - # Set default ylabel - ylabel = "Feature impact" if shap_plot else f"Feature count (-/+ {val_col})" - ax.set_ylabel(ylabel, size=opt_size) - # Adjust y ticks - ytick_size = opt_size if ytick_size is None else ytick_size - plt.yticks(size=ytick_size) - plt.tick_params(axis="y", color="black", width=ytick_width, length=ytick_length, bottom=False) - sns.despine(top=True, right=True) - # Add grid - if grid: - ax.set_axisbelow(True) # Grid behind datasets - ax.grid(which="major", axis=grid_axis, linestyle="-") - # Add tmd area - if highlight_tmd_area: - cpp_plot.highlight_tmd_area(ax=ax, x_shift=-0.5, tmd_color=tmd_color, alpha=highlight_alpha) - # Add tmd_jmd sequence if sequence is given - if type(tmd_seq) == str: - ax = cpp_plot.add_tmd_jmd_seq(ax=ax, **args_seq, **args_size, **args_part_color, **args_seq_color, - xticks_pos=xticks_pos, heatmap=False, x_shift=0, - xtick_size=xtick_size) # Add tmd_jmd bar - self.ax_seq = ax - elif add_jmd_tmd: - size = opt_size if tmd_jmd_fontsize is None else tmd_jmd_fontsize - cpp_plot.add_tmd_jmd_bar(ax=ax, x_shift=-0.5, **args_part_color, add_white_bar=False) - cpp_plot.add_tmd_jmd_xticks(ax=ax, x_shift=0, **args_xtick) - cpp_plot.add_tmd_jmd_text(ax=ax, 
x_shift=-0.5, tmd_jmd_fontsize=size) - - # Set current axis to main axis object depending on tmd sequence given or not - plt.yticks(size=ytick_size) - plt.tick_params(axis="y", color="black", width=ytick_width, length=ytick_length, bottom=False) - plt.sca(plt.gcf().axes[0]) - ax = plt.gca() - return ax
    - -
    [docs] def heatmap(self, df_feat=None, y="subcategory", val_col="mean_dif", val_type="mean", normalize=False, - figsize=(8, 5), title=None, title_kws=None, - vmin=None, vmax=None, grid_on=True, - cmap="RdBu_r", cmap_n_colors=None, dict_color=None, cbar_kws=None, facecolor_dark=False, - add_jmd_tmd=True, tmd_len=20, start=1, - jmd_n_seq=None, tmd_seq=None, jmd_c_seq=None, - tmd_color="mediumspringgreen", jmd_color="blue", tmd_seq_color="black", jmd_seq_color="white", - seq_size=None, tmd_jmd_fontsize=None, - xticks_pos=False, xtick_size=11.0, xtick_width=2.0, xtick_length=5.0, ytick_size=None, - add_legend_cat=True, legend_kws=None, - add_importance_map=False, cbar_pct=False, **kwargs): - """ - Plot a featuremap of the selected value column with scale information (y-axis) versus sequence position (x-axis). - - This is a wrapper function for :func:`seaborn.heatmap`, designed to highlight differences between two sets - of sequences at the positional level (e.g., amino acid level for protein sequences). - - Parameters - ---------- - df_feat : :class:`~pandas.DataFrame`, shape (n_feature, n_feature_information) - DataFrame containing unique identifiers, scale information, statistics, and positions for each feature. - y : {'category', 'subcategory', 'scale_name'}, str, default = 'subcategory' - Name of the column in the feature DataFrame representing scale information (shown on the y-axis). - val_col : {'mean_dif', 'feat_impact', 'abs_auc', 'std_test', ...}, str, default = 'mean_dif' - Name of the column in the feature DataFrame containing numerical values to display. - val_type : {'mean', 'sum', 'std'}, str, default = 'mean' - Method to aggregate numerical values from 'val_col'. - normalize : {True, False, 'positions', 'positions_only'}, bool/str, default = False - Specifies normalization for numerical values in 'val_col': - - - False: Set value at all positions of a feature without further normalization. 
- - - True: Set value at all positions of a feature and normalize across all features. - - - 'positions': Value/number of positions set at each position of a feature and normalized across features. - Recommended when aiming to emphasize features with fewer positions using 'val_col'='feat_impact' and 'value_type'='mean'. - - figsize : tuple(float, float), default = (10,7) - Width and height of the figure in inches passed to :func:`matplotlib.pyplot.figure`. - title : str, optional - Title of figure used by :func:`matplotlib.pyplot.title`. - title_kws : dict, optional - Keyword arguments passed to :func:`matplotlib.pyplot.title`. - vmin, vmax : float, optional - Values to anchor the colormap, otherwise, inferred from data and other keyword arguments. - cmap : matplotlib colormap name or object, or list of colors, default = 'seismic' - Name of color map assigning data values to color space. If 'SHAP', colors from - `SHAP <https://shap.readthedocs.io/en/latest/index.html>`_ will be used (recommended for feature impact). - cmap_n_colors : int, optional - Number of discrete steps in diverging or sequential color map. - dict_color : dict, optional - Map of colors for scale categories classifying scales shown on y-axis. - cbar_kws : dict of key, value mappings, optional - Keyword arguments for :meth:`matplotlib.figure.Figure.colorbar`. - add_jmd_tmd : bool, default = True - Whether to add colored bar under heatmap indicating sequence parts (JMD-N, TMD, JMD-C). - tmd_len : int, >0 - Length of TMD to be depiceted. - start : int, >=0 - Position label of first amino acid position (starting at N-terminus). - tmd_seq : str, optional - Sequence of TMD. 'tmd_len' is set to length of TMD if sequence for TMD, JMD-N and JMD-C are given. - Recommended if feature impact or mean difference should be depicted for one sample. - jmd_n_seq : str, optional - Sequence of JMD_N. 'jmd_n_len' is set to length of JMD_N if sequence for TMD, JMD-N and JMD-C are given. 
- Recommended if feature impact or mean difference should be depicted for one sample. - jmd_c_seq : str, optional - Sequence of JMD_C. 'jmd_c_len' is set to length of JMD_C if sequence for TMD, JMD-N and JMD-C are given. - Recommended if feature impact or mean difference should be depicted for one sample. - tmd_color : str, default = 'mediumspringgreen' - Color of TMD bar. - jmd_color : str, default = 'blue' - Color of JMD-N and JMD-C bar. - tmd_seq_color : str, default = 'black' - Color of TMD sequence. - jmd_seq_color : str, default = 'white' - Color of JMD-N and JMD-C sequence. - seq_size : float, optional - Font size of all sequence parts in points. If None, optimized automatically. - tmd_jmd_fontsize : float, optional - Font size of 'TMD', 'JMD-N' and 'JMD-C' label in points. If None, optimized automatically. - xtick_size : float, default = 11.0 - Size of x ticks in points. Passed as 'size' argument to :meth:`matplotlib.axes.Axes.set_xticklabels`. - xtick_width : float, default = 2.0 - Widht of x ticks in points. Passed as 'width' argument to :meth:`matplotlib.axes.Axes.tick_params`. - xtick_length : float, default = 5.0, - Length of x ticks in points. Passed as 'length' argument to :meth:`matplotlib.axes.Axes.tick_params`. - ytick_size : float, optional - Size of scale information as y ticks in points. Passed to :meth:`matplotlib.axes.Axes.tick_params`. - If None, optimized automatically. - add_legend_cat : bool, default = True, - Whether to add legend for categories under plot and classification of scales at y-axis. - legend_kws : dict, optional - Keyword arguments passed to :meth:`matplotlib.axes.Axes.legend` - kwargs : other keyword arguments - All other keyword arguments passed to :meth:`matplotlib.axes.Axes.pcolormesh`. - - Returns - ------- - ax : matplotlib Axes - Axes object containing the heatmap. - - Warnings - -------- - - 'cmap_n_colors' is effective only if 'vmin' and 'vmax' align with the data. 
- - - 'tmd_seq_color' and 'jmd_seq_color' are applicable only when 'tmd_seq', 'jmd_n_seq', and 'jmd_c_seq' are provided. - - See Also - -------- - seaborn.heatmap - Plotting heatmap using seaborn. - See `Seaborn documentation <https://seaborn.pydata.org/generated/seaborn.heatmap.html>`_ for more details. - - Examples - -------- - - Plot CPP feature heatmap: - - .. plot:: - :context: close-figs - - >>> import matplotlib.pyplot as plt - >>> import aaanalysis as aa - >>> sf = aa.SequenceFeature() - >>> df_seq = aa.load_dataset(name='SEQ_DISULFIDE', min_len=100) - >>> labels = list(df_seq["label"]) - >>> df_parts = sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=10) - >>> #split_kws = sf.get_split_kws(n_split_min=1, n_split_max=3, split_types=["Segment", "PeriodicPattern"]) - >>> #df_scales = aa.load_scales(unclassified_in=False).sample(n=10, axis=1) - >>> #cpp = aa.CPP(df_parts=df_parts, split_kws=split_kws, df_scales=df_scales) - >>> #df_feat = cpp.run(labels=labels) - >>> #cpp.plot_heatmap(df_feat=df_feat) - >>> #plt.tight_layout() - - """ - # Group arguments - args_seq = dict(jmd_n_seq=jmd_n_seq, tmd_seq=tmd_seq, jmd_c_seq=jmd_c_seq) - args_size = check_args_size(seq_size=seq_size, tmd_jmd_fontsize=tmd_jmd_fontsize) - args_len = ut.check_args_len(tmd_len=tmd_len, jmd_n_len=self.jmd_n_len, jmd_c_len=self.jmd_c_len, **args_seq) - args_xtick = check_args_xtick(xtick_size=xtick_size, xtick_width=xtick_width, xtick_length=xtick_length) - args_part_color = check_part_color(tmd_color=tmd_color, jmd_color=jmd_color) - args_seq_color = check_seq_color(tmd_seq_color=tmd_seq_color, jmd_seq_color=jmd_seq_color) - - # Checking input - # Args checked by Matplotlib: title, cmap, cbar_kws, legend_kws - ut.check_non_negative_number(name="start", val=start, min_val=0) - ut.check_non_negative_number(name="ytick_size", val=ytick_size, accept_none=True, just_int=False, min_val=1) - ut.check_non_negative_number(name="cmap_n_colors", val=cmap_n_colors, min_val=1, accept_none=True) 
- ut.check_bool(name="add_jmd_tmd", val=add_jmd_tmd) - ut.check_bool(name="add_legend_cat", val=add_legend_cat) - ut.check_dict(name="legend_kws", val=legend_kws, accept_none=True) - ut.check_dict(name="cbar_kws", val=cbar_kws, accept_none=True) - ut.check_col_in_df(df=df_feat, name_df="df_feat", col=val_col, col_type=[float, int]) - ut.check_y_categorical(df=df_feat, y=y) - df_feat = ut.check_df_feat(df_feat=df_feat, df_cat=self.df_cat) - check_value_type(val_type=val_type, count_in=False) - check_vmin_vmax(vmin=vmin, vmax=vmax) - check_figsize(figsize=figsize) - dict_color = check_dict_color(dict_color=dict_color, df_cat=self.df_cat) - - # Get df positions - df_feat = self.add_positions(df_feat=df_feat, tmd_len=args_len["tmd_len"], start=start) - df_pos = _get_df_pos(df_feat=df_feat, df_cat=self.df_cat, y=y, val_col=val_col, - value_type=val_type, normalize=normalize, start=start, **args_len) - # Plotting - cpp_plot = CPPPlots(**args_len, start=start) - cpp_plot.set_figsize(figsize=figsize) # figsize is not used as argument in seaborn (but in pandas) - try: - linecolor = "gray" if facecolor_dark else "black" - if "linecolor" in kwargs: - linecolor = kwargs["linecolor"] - else: - kwargs["linecolor"] = linecolor - ax = cpp_plot.heatmap(df_pos=df_pos, vmin=vmin, vmax=vmax, grid_on=grid_on, - cmap=cmap, cmap_n_colors=cmap_n_colors, cbar_kws=cbar_kws, - x_shift=0.5, ytick_size=ytick_size, facecolor_dark=facecolor_dark, - cbar_pct=cbar_pct, **args_xtick, **kwargs) - ax.axvline(self.jmd_n_len, color=linecolor, linestyle="-", linewidth=1.5) - ax.axvline(x=self.jmd_n_len + args_len["tmd_len"], color=linecolor, linestyle="-", linewidth=1.5) - - except AttributeError as e: - error_message = check_parameters(func=self.heatmap, name_called_func="sns.heatmap", e=e) - raise AttributeError(error_message) - cpp_plot.set_title(title=title, title_kws=title_kws) - # Autosize tmd sequence & annotation - opt_size = cpp_plot.optimize_label_size(ax=ax, df_pos=df_pos) - # Add importance 
map - if add_importance_map: - _add_importance_map(ax=ax, df_feat=df_feat, df_cat=self.df_cat, - start=start, args_len=args_len, y=y) - # Add scale classification - if add_legend_cat: - ax = cpp_plot.add_legend_cat(ax=ax, df_pos=df_pos, df_cat=self.df_cat, y=y, dict_color=dict_color, - legend_kws=legend_kws) - # Add tmd_jmd sequence if sequence is given - if isinstance(tmd_seq, str): - ax = cpp_plot.add_tmd_jmd_seq(ax=ax, **args_seq, **args_size, **args_part_color, **args_seq_color, - xticks_pos=xticks_pos, - x_shift=0.5, xtick_size=xtick_size) - self.ax_seq = ax - # Add tmd_jmd bar - elif add_jmd_tmd: - size = opt_size if tmd_jmd_fontsize is None else tmd_jmd_fontsize - cpp_plot.add_tmd_jmd_bar(ax=ax, **args_part_color) - cpp_plot.add_tmd_jmd_xticks(ax=ax, x_shift=0.5, **args_xtick) - cpp_plot.add_tmd_jmd_text(ax=ax, x_shift=0, tmd_jmd_fontsize=size) - # Set current axis to main axis object depending on tmd sequence given or not - plt.sca(plt.gcf().axes[0]) - ax = plt.gca() - return ax
    - -
    [docs] def update_seq_size(self): - """""" - # TODO legend changes slightly if sequnece length altered (e.g. PTPRM_MOUSE vs A4_HUMAN) - # TODO look for more extreme example and text - f = lambda l: l.get_window_extent(ax.figure.canvas.get_renderer()) - ax = self.ax_seq - labels = ax.xaxis.get_ticklabels(which="both") - tick_positions = [f(l).x0 for l in labels] - sorted_tick_positions, sorted_labels = zip(*sorted(zip(tick_positions, labels), key=lambda t: t[0])) - # Adjust font size to prevent overlap - seq_size = get_optimal_fontsize(ax, sorted_labels) - for l in sorted_labels: - l.set_fontsize(seq_size)
    -
    - -
    -
    - -
    -
    -
    -
    - - - - \ No newline at end of file diff --git a/docs/build/html/_modules/aaanalysis/cpp/feature.html b/docs/build/html/_modules/aaanalysis/cpp/feature.html deleted file mode 100644 index 6fe41a76..00000000 --- a/docs/build/html/_modules/aaanalysis/cpp/feature.html +++ /dev/null @@ -1,746 +0,0 @@ - - - - - - aaanalysis.cpp.feature — AAanalysis - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - -
    - -
    -
    -
    - -
    -
    -
    -
    - -

    Source code for aaanalysis.cpp.feature

    -"""
    -Script for SequenceFeature() object that combines scales, splits, and parts to create
    -    feature names, feature values, or a feature matrix for ML or CPP pipelines.
    -"""
    -import os
    -import pandas as pd
    -import numpy as np
    -import math
    -from itertools import repeat
    -import multiprocessing as mp
    -import warnings
    -
    -from aaanalysis.cpp._feature_pos import SequenceFeaturePositions
    -from aaanalysis.cpp._split import Split, SplitRange
    -from aaanalysis.cpp._part import Parts
    -
    -import aaanalysis as aa
    -import aaanalysis.utils as ut
    -
    -# TODO simplify and check
    -
    -
    -# I Helper Functions
    -# Check for add methods
    -def check_ref_group(ref_group=0, labels=None):
    -    """Check if ref group class lable"""
    -    if ref_group not in labels:
    -        raise ValueError(f"'ref_group' ({ref_group}) not class label: {set(labels)}.")
    -
    -
    -def check_sample_in_df_seq(sample_name=None, df_seq=None):
    -    """Check if sample name in df_seq"""
    -    list_names = list(df_seq[ut.COL_NAME])
    -    if sample_name not in list_names:
    -        error = f"'sample_name' ('{sample_name}') not in '{ut.COL_NAME}' of 'df_seq'." \
    -                f"\nValid names are: {list_names}"
    -        raise ValueError(error)
    -
    -
    -# Check load functions
    -def check_clustered(complete=False, clust_th=0.7):
    -    """Check input for loading functions"""
    -    if not complete and clust_th not in [0.9, 0.7, 0.5, 0.3]:
    -        raise ValueError("'clust_th' should be 0.3, 0.5, 0.7, or 0.9")
    -
    -
    -# Check functions get_split_kws
    -def check_split_types(split_types=None):
    -    """Check split_type"""
    -    if type(split_types) is str:
    -        split_types = [split_types]
    -    list_split_types = [ut.STR_SEGMENT, ut.STR_PATTERN, ut.STR_PERIODIC_PATTERN]
    -    if split_types is None:
    -        split_types = list_split_types
    -    if not set(list_split_types).issuperset(set(split_types)):
    -        raise ValueError(f"'split_types'({split_types}) must be in {list_split_types}")
    -    return split_types
    -
    -
    -def check_split_int_args(kwargs_int=None):
    -    """Check type of given arguments"""
    -    for arg in kwargs_int:
    -        arg_val = kwargs_int[arg]
    -        ut.check_non_negative_number(name=arg, val=arg_val)
    -
    -
    -def check_split_list_args(kwargs_list=None, accept_none=True):
    -    """Check type of given arguments"""
    -    for arg in kwargs_list:
    -        arg_val = kwargs_list[arg]
    -        if not (accept_none and arg_val is None):
    -            if type(arg_val) != list:
    -                raise ValueError(f"'{arg}' ({arg_val}) should be list with non-negative integers")
    -            else:
    -                for i in arg_val:
    -                    if type(i) != int or i < 0:
    -                        raise ValueError(f"Elements in '{arg}' ({arg_val}) should be non-negative integer")
    -
    -
    -# Check functions feature values
    -def _get_missing_elements(df_parts=None, scale_elements=None, accept_gaps=False):
    -    """Get missing elements"""
    -    seq_elements = set("".join(df_parts.values.flatten()))
    -    if accept_gaps:
    -        missing_elements = [x for x in seq_elements if x not in scale_elements and x != ut.STR_AA_GAP]
    -    else:
    -        missing_elements = [x for x in seq_elements if x not in scale_elements]
    -    return missing_elements
    -
    -
    -def check_dict_scale(dict_scale=None, df_parts=None, accept_gaps=False):
    -    """Check if dict_scale is dictionary with numerical values"""
    -    if not isinstance(dict_scale, dict):
    -        raise ValueError("'dict_scale' must be a dictionary with values of type float or int")
    -    if accept_gaps:
    -        f = lambda key: type(dict_scale[key]) not in [float, int]
    -    else:
    -        f = lambda key: type(dict_scale[key]) not in [float, int] or math.isnan(dict_scale[key])
    -    wrong_type = [(key, dict_scale[key]) for key in dict_scale if f(key)]
    -    if len(wrong_type) > 0:
    -        error = "'dict_scale' must be a dictionary with values of type float or int." \
    -                "\n Following key-value pairs are not accepted: {}".format(wrong_type)
    -        raise ValueError(error)
    -    # Check matching of scale to sequences of df_parts
    -    args = dict(df_parts=df_parts, scale_elements=list(dict_scale.keys()), accept_gaps=accept_gaps)
    -    missing_elements = _get_missing_elements(**args)
    -    if len(missing_elements) > 0:
    -        raise ValueError(f"Scale does not match for following sequence element: {missing_elements}")
    -
    -
    -# Check functions feature matrix
    -def check_df_scales_matches_df_parts(df_scales=None, df_parts=None, accept_gaps=False):
    -    """Check if df_scales has values for all Letters in sequences from df_parts"""
    -    args = dict(df_parts=df_parts, scale_elements=list(df_scales.index), accept_gaps=accept_gaps)
    -    missing_elements = _get_missing_elements(**args)
    -    if len(missing_elements) > 0:
    -        raise ValueError(f"Scale does not match for following sequence element: {missing_elements}")
    -
    -
    -def check_parts_in_df_parts(df_parts=None, part=None):
    -    """Check if part in df_parts"""
    -    if part.lower() not in list(df_parts):
    -        raise ValueError("'part' ({}) must be in columns of 'df_parts': {}".format(part, list(df_parts)))
    -
    -
    -# Functions to create feature (part + split + scale)
    -def _get_feature_components(feat_name=None, dict_all_scales=None):
    -    """Convert feature name into three feature components of part, split, and scale given as dictionary"""
    -    if feat_name is None or dict_all_scales is None:
    -        raise ValueError("'feature_name' and 'dict_all_scales' must be given")
    -    part, split, scale = feat_name.split("-")
    -    if scale not in dict_all_scales:
    -        raise ValueError("'scale' from 'feature_name' is not in 'dict_all_scales")
    -    dict_scale = dict_all_scales[scale]
    -    return part, split, dict_scale
    -
    -
    -def _feature_value(df_parts=None, split=None, dict_scale=None, accept_gaps=False):
    -    """Helper function to create feature values for feature matrix"""
    -    sp = Split()
    -    # Get vectorized split function
    -    split_type, split_kwargs = ut.check_split(split=split)
    -    f_split = getattr(sp, split_type.lower())
    -    # Vectorize split function using anonymous function
    -    vf_split = np.vectorize(lambda x: f_split(seq=x, **split_kwargs))
    -    # Get vectorized scale function
    -    vf_scale = ut.get_vf_scale(dict_scale=dict_scale, accept_gaps=accept_gaps)
    -    # Combine part split and scale to get feature values
    -    part_split = vf_split(df_parts)
    -    feature_value = np.round(vf_scale(part_split), 5)  # feature values
    -    return feature_value
    -
    -
    -def _feature_matrix(feat_names, dict_all_scales, df_parts, accept_gaps):
    -    """Helper function to create feature matrix via multiple processing"""
    -    feat_matrix = np.empty([len(df_parts), len(feat_names)])
    -    for i, feat_name in enumerate(feat_names):
    -        part, split, dict_scale = _get_feature_components(feat_name=feat_name,
    -                                                          dict_all_scales=dict_all_scales)
    -        check_parts_in_df_parts(df_parts=df_parts, part=part)
    -        feat_matrix[:, i] = _feature_value(split=split,
    -                                           dict_scale=dict_scale,
    -                                           df_parts=df_parts[part.lower()],
    -                                           accept_gaps=accept_gaps)
    -    return feat_matrix
    -    
    -    
    -# II Main Functions
    -
    [docs]class SequenceFeature: - """Retrieve and create sequence feature components (Part, Split, and Scale). - - Notes - ----- - Part: Feature Component - A continuous subset of a sequence like a protein domain (e.g, transmembrane domain of membrane proteins). - - Split: Feature Component - Principle to obtain a distinct subset of amino acids from a sequence part like a segment or a pattern. - - Scale: Feature Component - A physicochemical scale assigning each amino acid a numerical value between 0 and 1. - - Feature: Part + Split + Scale - Physicochemical property (expressed as numerical scale) present at distinct amino acid - positions within a protein sequence. The positions are obtained by splitting sequence parts - into segments or patterns. - - Feature value: Realization of a Feature - For a given sequence, a feature value is the average of a physicochemical scale over - all amino acids obtained by splitting a sequence part. - - List of valid sequence parts: - ['tmd', 'tmd_e', 'tmd_n', 'tmd_c', 'jmd_n', 'jmd_c', 'ext_c', 'ext_n', - 'tmd_jmd', 'jmd_n_tmd_n', 'tmd_c_jmd_c', 'ext_n_tmd_n', 'tmd_c_ext_c'] - """ - - # Basic datastructures for features -
    [docs] @staticmethod - def get_df_parts(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None, ext_len=4, all_parts=False): - """Create DataFrane with sequence parts. - - Parameters - ---------- - df_seq: :class:`pandas.DataFrame` - DataFrame with sequence information comprising either sequence ('sequence', 'tmd_start', 'tmd_stop') - or sequence part ('jmd_n', 'tmd', 'jmd_c') columns. - list_parts: list of string, len>=1 - Names of sequence parts which should be created (e.g., 'tmd'). - jmd_n_len: int, default = None, optional - Length of JMD-N in number of amino acids. If None, 'jmd_n' column must be given in df_seq. - jmd_c_len: int, default = None, optional - Length of JMD-N in number of amino acids. If None, 'jmd_c' column must be given in df_seq. - ext_len: int, default = 4 - Lenght of N- resp. C-terminal extra part of TMD. - all_parts: bool, default = False - Whether to create DataFrame with all possible sequence parts (if True) or parts given by list_parts. - - Returns - ------- - df_parts: :class:`pandas.DataFrame` - DataFrame with sequence parts. - - Notes - ----- - List of valid sequence parts can be found in :class: ´aaanalysis.SequenceFeature´. 
- - Examples - -------- - Get sequence parts based on parts columns in df_seq with with 'tmd_e', and 'tmd_jmd' as parts: - - >>> import aaanalysis as aa - >>> sf = aa.SequenceFeature() - >>> df_seq = aa.load_dataset(name='GSEC_SUB_SEQ') - >>> df_parts = sf.get_df_parts(df_seq=df_seq, list_parts=["tmd_e", "tmd_jmd"]) - - Get sequence parts based on sequence column in df_seq and jmd_n_len and jmd_c_len with default parts: - - >>> import aaanalysis as aa - >>> sf = aa.SequenceFeature() - >>> df_seq = aa.load_dataset(name='GSEC_SUB_SEQ') - >>> df_parts = sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=10) - """ - ut.check_args_len(jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, ext_len=ext_len, accept_tmd_none=True) - df_seq = ut.check_df_seq(df_seq=df_seq, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len) - list_parts = ut.check_list_parts(list_parts=list_parts, all_parts=all_parts) - seq_info_in_df = set(ut.COLS_SEQ_INFO).issubset(set(df_seq)) - pa = Parts() - dict_parts = {} - for i, row in df_seq.iterrows(): - entry = row[ut.COL_ENTRY] - if jmd_c_len is not None and jmd_n_len is not None and seq_info_in_df: - seq, start, stop = row[ut.COLS_SEQ_INFO].values - parts = pa.create_parts(seq=seq, tmd_start=start, tmd_stop=stop, - jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len) - jmd_n, tmd, jmd_c = parts.jmd_n, parts.tmd, parts.jmd_c - else: - jmd_n, tmd, jmd_c = row[ut.COLS_PARTS].values - dict_part_seq = pa.get_dict_part_seq(tmd=tmd, jmd_n=jmd_n, jmd_c=jmd_c, ext_len=ext_len) - dict_part_seq = {part: dict_part_seq[part] for part in list_parts} - dict_parts[entry] = dict_part_seq - df_parts = pd.DataFrame.from_dict(dict_parts, orient="index") - return df_parts
    - -
    [docs] @staticmethod - def get_split_kws(n_split_min=1, n_split_max=15, steps_pattern=None, n_min=2, n_max=4, len_max=15, - steps_periodicpattern=None, split_types=None): - """Create dictionary with kwargs for three split types: Segment, Pattern, PeriodicPattern - - Parameters - ---------- - n_split_min: int, default = 1 - Number greater 0 to specify the greatest Segment (e.g., 1/1 TMD alias whole TMD sequence). - n_split_max: int, default = 15, - Number greater n_split_min to specfiy the smallest Segment (e.g., 1/15 TMD). - steps_pattern: list of integers, default = [3, 4, 6, 7, 8] - Possible steps sizes for Pattern. - n_min: int, default = 2 - Minimum number of steps for Pattern. - n_max: int, default = 4 - Maximum number of steps for Pattern. - len_max: int, default = 10 - Maximum length in amino acid position for Pattern by varying start position. - steps_periodicpattern: list of integers, default = [3, 4] - Step sizes for PeriodicPattern. - split_types: list of strings, default = ["Segment", "Pattern" "PeriodicPattern"] - Split types for which paramter dictionary should be generated. 
- - Returns - ------- - split_kws: dict - Nested dictionary with parameters for chosen split_types: - - a) Segment: {n_split_min:1, n_split_max=15} - b) Pattern: {steps=[3, 4], n_min=2, n_max=4, len_max=15} - c) PeriodicPattern: {steps=[3, 4]} - - Examples - -------- - Get default arguments for all splits types (Segment, Pattern, PeriodicPattern): - - >>> import aaanalysis as aa - >>> sf = aa.SequenceFeature() - >>> split_kws = sf.get_split_kws() - - Get default argumetns for Segment split: - - >>> import aaanalysis as aa - >>> sf = aa.SequenceFeature() - >>> split_kws = sf.get_split_kws(split_types="Segment") - """ - split_types = check_split_types(split_types=split_types) - args_int = dict(n_split_min=n_split_min, n_split_max=n_split_max, n_min=n_min, n_max=n_max, len_max=len_max) - check_split_int_args(kwargs_int=args_int) - args_list = dict(steps_pattern=steps_pattern, steps_periodicpattern=steps_periodicpattern) - check_split_list_args(kwargs_list=args_list) - if steps_pattern is None: - # Differences between interacting amino acids in helix (without gaps) include 6, 7 ,8 to include gaps - steps_pattern = [3, 4] - if steps_periodicpattern is None: - steps_periodicpattern = [3, 4] # Differences between interacting amino acids in helix (without gaps) - split_kws = {ut.STR_SEGMENT: dict(n_split_min=n_split_min, n_split_max=n_split_max), - ut.STR_PATTERN: dict(steps=steps_pattern, n_min=n_min, n_max=n_max, len_max=len_max), - ut.STR_PERIODIC_PATTERN: dict(steps=steps_periodicpattern)} - split_kws = {x: split_kws[x] for x in split_types} - ut.check_split_kws(split_kws=split_kws) - return split_kws
    - -
    [docs] def get_features(self, list_parts=None, split_kws=None, df_scales=None, all_parts=False): - """Create list of all feature ids for given Parts, Splits, and Scales - - Parameters - ---------- - list_parts: list of strings (n>=1 parts), default = ["tmd_e", "jmd_n_tmd_n", "tmd_c_jmd_c"] - Names of sequence parts which should be created (e.g., 'tmd'). - split_kws: dict, default = SequenceFeature.get_split_kws - Nested dictionary with parameter dictionary for each chosen split_type. - df_scales: :class:`pandas.DataFrame`, default = SequenceFeature.load_scales - DataFrame with default amino acid scales. - all_parts: bool, default = False - Whether to create DataFrame with all possible sequence parts (if True) or parts given by list_parts. - - Returns - ------- - features: list of strings - Ids of all possible features for combination of Parts, Splits, and Scales with form: PART-SPLIT-SCALE - - """ - list_parts = ut.check_list_parts(list_parts=list_parts, all_parts=all_parts) - ut.check_split_kws(split_kws=split_kws) - ut.check_df_scales(df_scales=df_scales, accept_none=True) - if df_scales is None: - df_scales = aa.load_scales() - if split_kws is None: - split_kws = self.get_split_kws() - scales = list(df_scales) - spr = SplitRange() - features = [] - for split_type in split_kws: - args = split_kws[split_type] - labels_s = getattr(spr, "labels_" + split_type.lower())(**args) - features.extend(["{}-{}-{}".format(p.upper(), s, sc) - for p in list_parts - for s in labels_s - for sc in scales]) - return features
    - -
    [docs] @staticmethod - def feat_matrix(features=None, df_parts=None, df_scales=None, accept_gaps=False, - n_jobs=None, verbose=False, return_labels=False): - """Create feature matrix for given feature ids and sequence parts. - - Parameters - ---------- - features: str, list of strings, pd.Series - Ids of features for which matrix of feature values should be created. - df_parts: :class:`pandas.DataFrame` - DataFrame with sequence parts. - df_scales: :class:`pandas.DataFrame`, optional - DataFrame with default amino acid scales. - accept_gaps: bool, default = False - Whether to accept missing values by enabling omitting for computations (if True). - n_jobs: int, default = None, - The number of jobs to run in parallel. If None, it will be set to the maximum. - verbose: bool, default = True - Whether to print size of to be created feature matrix (if True) or not otherwise. - return_labels: bool, default = False - Whether to return sample labels in addition to feature matrix. - - Returns - ------- - feat_matrix: array-like or sparse matrix, shape (n_samples, n_features) - Feature values of samples. - """ - ut.check_non_negative_number(name="j_jobs", val=n_jobs, accept_none=True, min_val=1, just_int=True) - if df_scales is None: - df_scales = aa.load_scales() - ut.check_df_scales(df_scales=df_scales) - ut.check_df_parts(df_parts=df_parts) - features = ut.check_features(features=features, parts=df_parts, df_scales=df_scales) - check_df_scales_matches_df_parts(df_scales=df_scales, df_parts=df_parts, accept_gaps=accept_gaps) - if verbose: - n_feat = len(features) - n_samples = len(df_parts) - n_vals = n_feat * n_samples - print(f"Feature matrix for {n_feat} features and {n_samples} samples will be created") - if n_vals > 1000*1000: - warning = f"Feature matrix with n={n_vals}>=10^6 values will be created, which will take some time.\n" \ - "It is recommended to create a feature matrix for a pre-selected number features " \ - "so that 10^6 values are not exceeded." 
- warnings.warn(warning) - # Create feature matrix using parallel processing - dict_all_scales = ut.get_dict_all_scales(df_scales=df_scales) - n_processes = min([os.cpu_count(), len(features)]) if n_jobs is None else n_jobs - feat_chunks = np.array_split(features, n_processes) - args = zip(feat_chunks, repeat(dict_all_scales), repeat(df_parts), repeat(accept_gaps)) - with mp.get_context("spawn").Pool(processes=n_processes) as pool: - result = pool.starmap(_feature_matrix, args) - feat_matrix = np.concatenate(result, axis=1) - if return_labels: - if verbose: - print("Tuple of (feat_matrix, labels) will be returned") - labels = df_parts.index.tolist() - return feat_matrix, labels # X, y - else: - if verbose: - print("Only feat_matrix (without labels) will be returned") - return feat_matrix # X
    - - # Additional feature related methods -
    [docs] @staticmethod - def feat_names(features=None, df_cat=None, tmd_len=20, jmd_c_len=10, jmd_n_len=10, ext_len=0, start=1): - """Convert feature ids (PART-SPLIT-SCALE) into feature names (scale name [positions]). - - Parameters - ---------- - features: str, list of strings, pd.Series - Ids of features for which feature names should be created. - df_cat: :class:`pandas.DataFrame`, default = SequenceFeature.load_categories - DataFrame with default categories for physicochemical amino acid scales - tmd_len: int, >0 - Length of TMD. - jmd_n_len: int, >0 - Length of JMD-N. - jmd_c_len: int, >0 - Length of JMD-C. - ext_len:int, >0 - Length of TMD-extending part (starting from C and N terminal part of TMD). - Conditions: ext_len<jmd_m_len and ext_len<jmd_c_len - start: int, >=0 - Position label of first amino acid position (starting at N-terminus). - - Returns - ------- - feat_names: list of strings - Names of features. - - Notes - ----- - Positions are given depending on the three split types: - - Segment: [first...last] - - Pattern: [all positions] - - PeriodicPattern: [first..step1/step2..last] - """ - # Check input (length checked in SequenceFeaturePositions) - features = ut.check_features(features=features) - ut.check_df_cat(df_cat=df_cat) - if df_cat is None: - df_cat = aa.load_scales(name=ut.STR_SCALE_CAT) - # Get feature names - sfp = SequenceFeaturePositions() - dict_part_pos = sfp.get_dict_part_pos(tmd_len=tmd_len, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, - ext_len=ext_len, start=start) - list_positions = sfp.get_positions(dict_part_pos=dict_part_pos, features=features) - dict_scales = dict(zip(df_cat[ut.COL_SCALE_ID], df_cat[ut.COL_SCALE_NAME])) - feat_names = [] - for feat_id, pos in zip(features, list_positions): - part, split, scale = feat_id.split("-") - split_type = split.split("(")[0] - if split_type == ut.STR_SEGMENT and len(pos.split(",")) > 2: - pos = pos.split(",")[0] + "..." 
+ pos.split(",")[-1] - if split_type == ut.STR_PERIODIC_PATTERN: - step = split.split("+")[1].split(",")[0] - pos = pos.split(",")[0] + ".." + step + ".." + pos.split(",")[-1] - feat_names.append(f"{dict_scales[scale]} [{pos}]") - return feat_names
    - - # Feature: Part + Split + Scale - # For what used? Not redudant with feature matrix? - # TODO Add functions (modify df_feat) -
    [docs] @staticmethod - def add_feat_value(df_parts=None, split=None, dict_scale=None, accept_gaps=False): - """Create feature values for all sequence parts by combining Part, Split, and Scale. - - Parameters - ---------- - df_parts: :class:`pandas.DataFrame` - DataFrame with sequence parts. - split: str - Name of Split following given convention. - dict_scale: dict - Dictionary mapping a numerical value to each letter of given sequences - accept_gaps: bool, default = False - Whether to accept missing values by enabling omitting for computations (if True). - - Returns - ------- - feature_value: array-like, shape (n_samples, n_parts) - Average scale values over sequence parts. - - Notes - ----- - A split name should has the form of PART-SPLIT-SCALE, where following structures - are given for the three split types: - - - Segment(i-th,n_split) - with i-th<=n_split and - where 'i-th' and 'n_split' indicate the i-th Segment resp. the number of Segments. - - - Pattern(N/C,p1,p2,...,pn) - with p1<p2<...<pn indicating amino acid positions and - 'N/C' whether the splits starts from the N resp. C-terminal sequence end. - - - PeriodicPattern(N/C,i+step1/step2,start) - where 'step1/step2' indicates the step size of each odd resp. even step and - 'start' gives the first position starting from the N- or C-terminal sequence end. - - All numbers should be non-negative integers. Examples for each split type - are as follows: 'Segment(5,7)', 'Pattern(C,1,2)', 'PeriodicPattern(N,i+2/3,1)'. - """ - ut.check_df_parts(df_parts=df_parts) - ut.check_split(split=split) - check_dict_scale(dict_scale=dict_scale, df_parts=df_parts, accept_gaps=accept_gaps) - feature_value = _feature_value(df_parts=df_parts, - split=split, - dict_scale=dict_scale, - accept_gaps=accept_gaps) - return feature_value
    - -
    [docs] @staticmethod - def add_dif(df_feat=None, df_seq=None, labels=None, sample_name=str, ref_group=0, - accept_gaps=False, jmd_n_len=10, jmd_c_len=10, df_parts=None, df_scales=None): - """ - Add feature value difference between sample and reference group to DataFrame. - - Parameters - ---------- - df_feat: :class:`pandas.DataFrame` - Feature DataFrame (CPP output) to add sample difference. - df_seq: :class:`pandas.DataFrame` - DataFrame with sequences and sample names, in which the given sample name is included. - labels: array-like, shape (n_samples) - Class labels for samples in sequence DataFrame. - sample_name: str - Name of sample for which the feature value difference to a given reference group should be computed. - ref_group: int, default = 0 - Class label of reference group. - accept_gaps: bool, default = False - Whether to accept missing values by enabling omitting for computations (if True). - - Returns - ------- - df_feat: :class:`pandas.DataFrame` - Feature DataFrame including feature value difference. - """ - # Check input - df_feat = ut.check_df_feat(df_feat=df_feat) - ut.check_df_seq(df_seq=df_seq, jmd_c_len=jmd_c_len, jmd_n_len=jmd_n_len) - ut.check_labels(labels=labels, df=df_seq, name_df="df_seq") - check_ref_group(ref_group=ref_group, labels=labels) - check_sample_in_df_seq(sample_name=sample_name, df_seq=df_seq) - # Add sample difference to reference group - sf = SequenceFeature() - X = sf.feat_matrix(features=list(df_feat["feature"]), - df_parts=df_parts, - df_scales=df_scales, - accept_gaps=accept_gaps) - mask = [True if x == ref_group else False for x in labels] - i = list(df_seq[ut.COL_NAME]).index(sample_name) - df_feat[f"dif_{sample_name}"] = X[i] - X[mask].mean() - return df_feat
    - -
    [docs] @staticmethod - def add_position(df_feat=None, features=None, start=1, tmd_len=20, jmd_n_len=10, jmd_c_len=10, ext_len=4, - part_split=False): - """Create list with positions for given feature names - - Parameters - ---------- - df_feat: :class:`pandas.DataFrame` - Feature DataFrame (CPP output) to add sample difference. - features: str, list of strings, pd.Series - Ids of features for which feature names should be created. - start: int, >=0, default = 1 - Position label of first amino acid position (starting at N-terminus). - tmd_len: int, >0, default = 20 - Length of TMD. - jmd_n_len : int, >=0, default = 10 - Length of JMD-N. - jmd_c_len : int, >=0, default = 10 - Length of JMD-C. - ext_len : int, >=0, default = 4 - Length of TMD-extending part (starting from C and N terminal part of TMD). - Conditions: ext_len < jmd_m_len and ext_len < jmd_c_len. - - Returns - ------- - feat_positions: list - list with positions for each feature in feat_names - - Notes - ----- - The length parameters define the total number of positions (jmd_n_len + tmd_len + jmd_c_len). - """ - # TODO add sequence, generalize check functions for tmd_len ... - features = ut.check_features(features=features) - ut.check_non_negative_number(name="tmd_len", val=tmd_len, just_int=True, min_val=1) - args = dict(jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, ext_len=ext_len, start=start) - for name in args: - ut.check_non_negative_number(name=name, val=args[name], just_int=True, min_val=0) - sfp = SequenceFeaturePositions() - dict_part_pos = sfp.get_dict_part_pos(tmd_len=tmd_len, **args) - feat_positions = sfp.get_positions(dict_part_pos=dict_part_pos, features=features) - return feat_positions
    - -
    - -
    -
    - -
    -
    -
    -
    - - - - \ No newline at end of file diff --git a/docs/build/html/_modules/aaanalysis/data_loader/data_loader.html b/docs/build/html/_modules/aaanalysis/data_loader/data_loader.html deleted file mode 100644 index ec73ec85..00000000 --- a/docs/build/html/_modules/aaanalysis/data_loader/data_loader.html +++ /dev/null @@ -1,294 +0,0 @@ - - - - - - aaanalysis.data_loader.data_loader — AAanalysis - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - -
    - -
    -
    -
    -
      -
    • - - -
    • -
    • -
    -
    -
    -
    -
    - -

    Source code for aaanalysis.data_loader.data_loader

    -"""
    -This is a script for loading protein sequence benchmarking datasets and amino acid scales and
    -their two-level classification (AAontology).
    -"""
    -import os
    -import pandas as pd
    -import numpy as np
    -import re
    -from typing import Optional, Literal
    -import aaanalysis.utils as ut
    -
    -# Constants
    -STR_AA_GAP = "-"
    -LIST_CANONICAL_AA = ['N', 'A', 'I', 'V', 'K', 'Q', 'R', 'M', 'H', 'F', 'E', 'D', 'C', 'G', 'L', 'T', 'S', 'Y', 'W', 'P']
    -NAME_SCALE_SETS_BASE = [ut.STR_SCALES, ut.STR_SCALES_RAW]
    -NAMES_SCALE_SETS = NAME_SCALE_SETS_BASE + [ut.STR_SCALE_CAT, ut.STR_SCALES_PC, ut.STR_TOP60, ut.STR_TOP60_EVAL]
    -FOLDER_BENCHMARKS = folder_in = ut.FOLDER_DATA + "benchmarks" + ut.SEP
    -
    -# I Helper Functions
    -
    -
    -# II Main Functions
    -def _adjust_non_canonical_aa(df=None, non_canonical_aa="remove"):
    -    """"""
    -    list_options = ["remove", "keep", "gap"]
    -    if non_canonical_aa not in list_options:
    -        raise ValueError(f"'non_canonical_aa' ({non_canonical_aa}) should be on of following: {list_options}")
    -    if non_canonical_aa == "keep":
    -        return df
    -    # Get all non-canonical amino acids
    -    f = lambda x: set(str(x))
    -    vf = np.vectorize(f)
    -    char_seq = set().union(*vf(df.values).flatten())
    -    list_non_canonical_aa = [x for x in char_seq if x not in LIST_CANONICAL_AA]
    -    if non_canonical_aa == "remove":
    -        pattern = '|'.join(list_non_canonical_aa)  # Joining list into a single regex pattern
    -        df = df[~df[ut.COL_SEQ].str.contains(pattern, regex=True)]
    -    else:
    -        df[ut.COL_SEQ] = [re.sub(f'[{"".join(list_non_canonical_aa)}]', STR_AA_GAP, x) for x in df[ut.COL_SEQ]]
    -    return df
    -
    -
    -def check_name_of_dataset(name="INFO", folder_in=None):
    -    """"""
    -    if name == "INFO":
    -        return
    -    list_datasets = [x.split(".")[0] for x in os.listdir(folder_in) if "." in x]
    -    if name not in list_datasets:
    -        list_aa = [x for x in list_datasets if 'AA' in x]
    -        list_seq = [x for x in list_datasets if 'SEQ' in x]
    -        list_dom = [x for x in list_datasets if 'DOM' in x]
    -        raise ValueError(f"'name' ({name}) is not valid."
    -                         f"\n Amino acid datasets: {list_aa}"
    -                         f"\n Sequence datasets: {list_seq}"
    -                         f"\n Domain datasets: {list_dom}")
    -
    -
    -
    [docs]def load_dataset(name: str = "INFO", - n: Optional[int] = None, - non_canonical_aa: Literal["remove", "keep", "gap"] = "remove", - min_len: Optional[int] = None, - max_len: Optional[int] = None) -> pd.DataFrame: - """ - Load protein benchmarking datasets. - - The benchmarks are distinguished into amino acid ('AA'), domain ('DOM'), and sequence ('SEQ') level - datasets. Use default settings (name='INFO') of an overview table. Detailed analysis is in :cite:`Breimann23a`. - - Parameters - ---------- - name - Name of the dataset. See 'Dataset' column in overview table. - n - Number of proteins per class. If None, the whole dataset will be returned. - non_canonical_aa - Options for modifying non-canonical amino acids: - - - 'remove': Remove sequences containing non-canonical amino acids. - - - 'keep': Do not remove sequences containing non-canonical amino acids. - - - 'gap': Non-canonical amino acids are replaced by gap symbol ('X'). - - min_len - Minimum length of sequences for filtering (disabled by default). - max_len - Maximum length of sequences for filtering (disabled by default). - - Returns - ------- - df_seq - Dataframe with the selected sequence dataset. - - Notes - ----- - See further information on the benchmark datasets in ref table. 
- - """ - ut.check_non_negative_number(name="n", val=n, accept_none=True) - ut.check_non_negative_number(name="min_len", val=min_len, accept_none=True) - check_name_of_dataset(name=name, folder_in=FOLDER_BENCHMARKS) - # Load overview table - if name == "INFO": - return pd.read_excel(FOLDER_BENCHMARKS + "INFO_benchmarks.xlsx") - df = pd.read_csv(FOLDER_BENCHMARKS + name + ".tsv", sep="\t") - # Filter Rdata - if min_len is not None: - mask = [len(x) >= min_len for x in df[ut.COL_SEQ]] - df = df[mask] - if max_len is not None: - mask = [len(x) <= max_len for x in df[ut.COL_SEQ]] - df = df[mask] - # Adjust non-canonical amino acid (keep, remove, or replace by gap) - df_seq = _adjust_non_canonical_aa(df=df, non_canonical_aa=non_canonical_aa) - # Select balanced groups - if n is not None: - labels = set(df_seq[ut.COL_LABEL]) - df_seq = pd.concat([df_seq[df_seq[ut.COL_LABEL] == l].head(n) for l in labels]) - return df_seq
    - - -# Load scales -def _filter_scales(df_cat=None, unclassified_in=False, just_aaindex=False): - """Filter scales for unclassified and aaindex scales""" - list_ids_not_in_aaindex = [x for x in df_cat[ut.COL_SCALE_ID] if "LINS" in x or "KOEH" in x] - list_ids_unclassified = [x for x, cat, sub_cat in zip(df_cat[ut.COL_SCALE_ID], df_cat[ut.COL_CAT], df_cat[ut.COL_SUBCAT]) - if "Unclassified" in sub_cat or cat == "Others"] - list_ids_to_exclude = [] - if not unclassified_in: - list_ids_to_exclude.extend(list_ids_unclassified) - if just_aaindex: - list_ids_to_exclude.extend(list_ids_not_in_aaindex) - df_cat = df_cat[~df_cat[ut.COL_SCALE_ID].isin(list_ids_to_exclude)] - return df_cat - - -# Extend for AAclustTop60 -
    [docs]def load_scales(name="scales", just_aaindex=False, unclassified_in=True): - """ - Load amino acid scales, scale classification (AAontology), or scale evaluation. - - A through analysis of the residue and sequence datasets can be found in TODO[Breimann23a]. - - Parameters - ---------- - name : str, default = 'scales' - Name of the dataset to load. Options are 'scales', 'scales_raw', 'scale_cat', - 'scales_pc', 'top60', and 'top60_eval'. - unclassified_in : bool, optional - Whether unclassified scales should be included. The 'Others' category counts as unclassified. - Only relevant if `name` is 'scales', 'scales_raw', or 'scale_classification'. - just_aaindex : bool, optional - Whether only scales provided from AAindex should be given. - Only relevant if `name` is 'scales', 'scales_raw', or 'scale_classification'. - - Returns - ------- - df : :class:`pandas.DataFrame` - Dataframe for the selected scale dataset. - """ - if name not in NAMES_SCALE_SETS: - raise ValueError(f"'name' ({name}) is not valid. Choose one of following: {NAMES_SCALE_SETS}") - # Load _data - df_cat = pd.read_excel(ut.FOLDER_DATA + f"{ut.STR_SCALE_CAT}.xlsx") - df_cat = _filter_scales(df_cat=df_cat, unclassified_in=unclassified_in, just_aaindex=just_aaindex) - if name == ut.STR_SCALE_CAT: - return df_cat - df = pd.read_excel(ut.FOLDER_DATA + name + ".xlsx", index_col=0) - # Filter scales - if name in NAME_SCALE_SETS_BASE: - df = df[[x for x in list(df) if x in list(df_cat[ut.COL_SCALE_ID])]] - return df
    -
    - -
    -
    - -
    -
    -
    -
    - - - - \ No newline at end of file diff --git a/docs/build/html/_modules/aaanalysis/dpulearn/dpulearn.html b/docs/build/html/_modules/aaanalysis/dpulearn/dpulearn.html deleted file mode 100644 index 8b39e1bb..00000000 --- a/docs/build/html/_modules/aaanalysis/dpulearn/dpulearn.html +++ /dev/null @@ -1,429 +0,0 @@ - - - - - - aaanalysis.dpulearn.dpulearn — AAanalysis - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - -
    - -
    -
    -
    - -
    -
    -
    -
    - -

    Source code for aaanalysis.dpulearn.dpulearn

    -"""
    -This is a script for deterministic Positive-Unlabeled (PU) Learning (dPULearn) class
    -"""
    -import numpy as np
    -import pandas as pd
    -from sklearn.metrics import pairwise_distances
    -from sklearn.decomposition import PCA
    -import math
    -import warnings
    -import aaanalysis.utils as ut
    -
    -# Settings
    -pd.set_option('expand_frame_repr', False)  # Single line print for pd.Dataframe
    -
    -LIST_METRICS = ['euclidean', 'manhattan', 'cosine']
    -
    -# TODO better example in fit
    -# TODO more check functions, improve with testing
    -
    -# I Helper Functions
    -# Check functions
    -def _check_metric(metric=None):
    -    """"""
    -    if metric is not None and metric not in LIST_METRICS:
    -        raise ValueError(f"'metric' ({metric}) should be None or one of following: {LIST_METRICS}")
    -
    -
    -def _check_df_seq(df_seq=None, col_class="class"):
    -    """"""
    -    if df_seq is not None:
    -        if col_class not in df_seq:
    -            columns = list(df_seq)
    -            raise ValueError(f"'col_class' ({col_class}) must be a column in 'df_seq': {columns}")
    -        if not df_seq.index.is_unique:
    -            df_seq = df_seq.reset_index(drop=True)
    -            warnings.warn("'df_seq' index was not unique. The index has been reset.", UserWarning)
    -    return df_seq
    -
    -
    -def _check_labels(labels=None, verbose=False, label_pos=None):
    -    # Check if labels is an array or list
    -    if not isinstance(labels, (list, np.ndarray)):
    -        raise TypeError(f"'labels' should be a list or a NumPy array, not {type(labels)}")
    -
    -    # Check if labels contain integers
    -    if not all(isinstance(label, int) for label in labels):
    -        raise ValueError("All elements in 'labels' should be integers")
    -    # Check if label_pos in labels
    -    if label_pos not in labels:
    -        str_error = f"'label_pos' ('{label_pos}', default=1) should be in 'labels' with ({list(np.unique(labels))})"
    -        raise ValueError(str_error)
    -    # Check if integers start with 0
    -    min_label = min(labels)
    -    if min_label != 0 and verbose:
    -        warnings.warn(f"The smallest label is {min_label}, typically should start with 0")
    -
    -    # Check if integers are consecutive
    -    unique_labels = sorted(set(labels))
    -    if any(unique_labels[i] - unique_labels[i - 1] != 1 for i in range(1, len(unique_labels))):
    -        if verbose:
    -            warnings.warn("Labels are not consecutive integers")
    -    if isinstance(labels, list):
    -        labels = np.array(labels)
    -    return labels
    -
    -
    -def _check_n_neg(labels=None, n_neg=None, label_pos=None, label_neg=None):
    -    """"""
    -    ut.check_non_negative_number(name='n_neg', val=n_neg, min_val=1)
    -    if sum([x == label_neg for x in labels]) > 0:
    -        raise ValueError(f"'labels' should not contain labels for negatives ({label_neg})")
    -    n_pos = sum([x == label_pos for x in labels])
    -    n_unl = sum([x != label_pos for x in labels])
    -    if n_pos < n_neg:
    -        raise ValueError(f"Number of positive labels ({n_pos}) should higher than 'n_neg' ({n_neg})")
    -    if n_unl < n_neg:
    -        raise ValueError(f"Number of unlabeled labels ({n_unl}) should higher than 'n_neg' ({n_neg})")
    -
    -
    -# Pre-processing helper functions
    -def _get_label_neg(labels=None):
    -    """"""
    -    label_neg = 0 if 0 not in labels else max(labels) + 1
    -    return label_neg
    -
    -
    -# II Main Functions
    -def _get_neg_via_distance(X=None, labels=None, metric="euclidean", n_neg=None,
    -                          df_seq=None, col_class=None,
    -                          label_neg=0, label_pos=1, name_neg=None):
    -    """Identify distant samples from positive mean as reliable negatives based on a specified distance metric.
    -
    -    Parameters:
    -    - X: np.ndarray, The input feature matrix of shape (n_samples, n_features).
    -    - labels: np.ndarray, Class labels for each sample.
    -    - metric: str, Distance metric ('euclidean', 'manhattan', etc.).
    -    - n_neg: int, Total number of negatives to identify.
    -    - df_seq: pd.DataFrame, Dataframe to store distance values.
    -    - col_class: str, Column name in df_seq to store class information.
    -    - label_neg, label_pos: int/str, Labels for the negative and positive classes.
    -    - name_neg: str, Prefix for naming identified negatives.
    -
    -    Returns:
    -    - new_labels: np.ndarray, Updated array of labels.
    -    - df_seq: pd.DataFrame, Dataframe with updated class information and distances.
    -    """
    -    mask_pos = labels == label_pos
    -    mask_unl = labels != label_pos
    -    # Compute the average distances to the positive datapoints
    -    avg_dist = pairwise_distances(X[mask_pos], X, metric=metric).mean(axis=0)
    -    # Select negatives based on largest average distance to positives
    -    top_indices = np.argsort(avg_dist[mask_unl])[::-1][:n_neg]
    -    new_labels = labels.copy()
    -    new_labels[top_indices] = label_neg
    -    # Update classes in df_seq and add average distance to positives
    -    if df_seq is not None:
    -        df_seq[metric] = avg_dist
    -        df_seq.loc[top_indices, col_class] = name_neg
    -    return new_labels, df_seq
    -
    -
    -def _get_neg_via_pca(X=None, labels=None, n_components=0.8, n_neg=None,
    -                     df_seq=None, col_class=None,
    -                     label_neg=0, label_pos=1, name_neg=None, **pca_kwargs):
    -    """Identify distant samples from positive mean as reliable negatives in PCA-compressed feature spaces.
    -
    -    Parameters:
    -    - X: np.ndarray, The input feature matrix of shape (n_samples, n_features).
    -    - labels: np.ndarray, Class labels for each sample.
    -    - n_components: float/int, Number of principal components or the ratio of total explained variance.
    -    - n_neg: int, Total number of negatives to identify.
    -    - df_seq: pd.DataFrame, Dataframe to store PCA values.
    -    - col_class: str, Column name in df_seq to store class information.
    -    - label_neg, label_pos: int/str, Labels for the negative and positive classes.
    -    - name_neg: str, Prefix for naming identified negatives.
    -    - pca_kwargs: dict, Additional keyword arguments for PCA.
    -
    -    Returns:
    -    - new_labels: np.ndarray, Updated array of labels.
    -    - df_seq: pd.DataFrame, Dataframe with updated class information.
    -    """
    -    # Principal component analysis
    -    pca = PCA(n_components=n_components, **pca_kwargs)
    -    pca.fit(X.T)
    -    list_exp_var = pca.explained_variance_ratio_
    -    _columns_pca = [f"PC{n+1} ({round(exp_var*100, 1)}%)" for n, exp_var in zip(range(len(list_exp_var)), list_exp_var)]
    -
    -    # Number of negatives based on explained variance
    -    _list_n_neg = [math.ceil(n_neg * x / sum(list_exp_var)) for x in list_exp_var]
    -    _list_n_cumsum = np.cumsum(np.array(_list_n_neg))
    -    list_n_neg = [n for n, cs in zip(_list_n_neg, _list_n_cumsum) if cs <= n_neg]
    -    if sum(list_n_neg) != n_neg:
    -        list_n_neg.append(n_neg - sum(list_n_neg))
    -    columns_pca = _columns_pca[0:len(list_n_neg)]
    -    df_seq[columns_pca] = pca.components_.T[:, 0:len(columns_pca)]
    -
    -    # Get mean of positive datafor each component
    -    mask_pos = labels == label_pos
    -    mask_unl = labels != label_pos
    -    pc_means = df_seq[mask_pos][columns_pca].mean(axis=0)
    -
    -    # Select negatives based on absolute difference to mean of positives for each component
    -    new_labels = labels.copy()
    -    _df = df_seq.copy()
    -    for col_pc, mean_pc, n in zip(columns_pca, pc_means, list_n_neg):
    -        name_reg_pc = f"{name_neg}_{col_pc.split(' ')[0]}"
    -        col_dif = f"{col_pc}_abs_dif"
    -
    -        # Calculate absolute difference to the mean for each sample in the component
    -        _df[col_dif] = np.abs(df_seq[col_pc] - mean_pc)
    -
    -        # Sort and take top n indices
    -        top_indices = _df.loc[mask_unl].sort_values(by=col_dif).tail(n).index
    -
    -        # Update labels and masks
    -        new_labels[top_indices] = label_neg
    -        mask_unl[top_indices] = False
    -
    -        # Update classes in df_seq
    -        if df_seq is not None:
    -            df_seq.loc[top_indices, col_class] = name_reg_pc
    -    return new_labels, df_seq
    -
    -
    -
    [docs]class dPULearn: - """ - Deterministic Positive-Unlabeled (dPULearn) model. - - dPULearn offers a deterministic approach for Positive-Unlabeled (PU) learning. The model primarily employs - Principal Component Analysis (PCA) to reduce the dimensionality of the feature space. Based on the most - informative principal components (PCs), it then iteratively identifies reliable negatives from the set of - unlabeled samples. These reliable negatives are those that are most distant from the positive samples in - the feature space. Alternatively, reliable negatives can also be identified using distance metrics like - Euclidean, Manhattan, or Cosine distance if specified. - - Parameters - ---------- - verbose : bool, default=False - Enable verbose output. - n_components : float or int, default=0.80 - Number of components to cover a maximum percentage of total variance when PCA is applied. - pca_kwargs : dict, default=None - Additional keyword arguments to pass to PCA. - metric : {'euclidean', 'manhattan', 'cosine'} or None, default=None - The distance metric to use. If None, PCA-based identification is used. - If a metric is specified, distance-based identification is performed. - - Attributes - ---------- - labels_ : array-like, shape (n_samples,) - Labels of each datapoint. - - Notes - ----- - - The method is inspired by deterministic PU learning techniques and follows - an information-theoretic PU learning approach. - - If `metric` is specified, distance-based identification of reliable negatives is performed. - Otherwise, PCA-based identification is used. - - Cosine metric is recommended in high-dimensional spaces. - - """ -
    [docs] def __init__(self, verbose=False, n_components=0.80, pca_kwargs=None, metric=None): - self.verbose = verbose - # Arguments for Principal Component Analysis (PCA)-based identification - self.n_components = n_components - if pca_kwargs is None: - pca_kwargs = dict() - self.pca_kwargs = pca_kwargs - # Arguments for distance-based identification - _check_metric(metric=metric) - self.metric = metric - # Output parameters (will be set during model fitting) - self.labels_ = None
    - - # Main method -
    [docs] def fit(self, X, labels=None, n_neg=0, label_pos=1, name_neg="REL_NEG", df_seq=None, col_class="class"): - """ - Fit the dPULearn model to identify reliable negative samples - from the provided feature matrix and labels. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Feature matrix where `n_samples` is the number of samples and `n_features` is the number of features. - labels : array-like, shape (n_samples,), default=None - Array of labels; positive samples should be indicated by `label_pos`. - n_neg : int, default=0 - Number of negative samples to identify. - label_pos : int or str, default=1 - Label indicating positive samples in the `labels` array. - name_neg : str, default="REL_NEG" - Name to assign to the newly identified negative samples. - df_seq : DataFrame, default=None, optional - DataFrame containing sequences; will be updated with new negative samples. - col_class : str, default="class" - Column name in `df_seq` where the class labels are stored. - - Returns - ------- - df_seq : DataFrame - DataFrame with the newly identified reliable negatives. Will be None if not provided. - - Notes - ----- - Distance-based identification is used if `metric` is specified during class initialization. - - Examples - -------- - Create small example datafor dPUlearn containg positive ('pos', 1) and unlabeled ('unl', 2) data - - >>> import aaanalysis as aa - >>> import pandas as pd - >>> import numpy as np - >>> X = np.array([[0.2, 0.1], [0.3, 0.2], [0.2, 0.3], [0.5, 0.7]]) - >>> labels = np.array([1, 2, 2, 2]) - >>> df_seq = pd.DataFrame({ - ... 'sequence': ['ATGC', 'GCTA', 'ACTG', 'TACG'], - ... 
'class': ['pos', 'unl', 'unl', 'unl']}) - - Use dPULearn in default mode (PC-based identification) and modify df_seq automatically - - >>> dpul = aa.dPULearn() - >>> n_neg = 2 - >>> df_seq = dpul.fit(X=X, df_seq=df_seq, labels=labels, n_neg=n_neg) - >>> labels = dpul.labels_ # Updated labels - - """ - ut.check_feat_matrix(X=X, labels=labels) - df_seq = _check_df_seq(df_seq=df_seq, col_class=col_class) - labels = _check_labels(labels=labels, verbose=self.verbose, label_pos=label_pos) - label_neg = _get_label_neg(labels=labels) - _check_n_neg(labels=labels, n_neg=n_neg, label_neg=label_neg, label_pos=label_pos) - # Compute average distance for threshold-based filtering (Yang et al., 2012, 2014; Nan et al. 2017) - args = dict(X=X, labels=labels, n_neg=n_neg, - df_seq=df_seq, col_class=col_class, - label_neg=label_neg, label_pos=label_pos, name_neg=name_neg) - if self.metric is not None: - new_labels, df_seq = _get_neg_via_distance(**args, metric=self.metric) - # Identify most far away negatives in PCA compressed feature space - else: - new_labels, df_seq = _get_neg_via_pca(**args, n_components=self.n_components, **self.pca_kwargs) - # Set new labels - self.labels_ = new_labels - return df_seq
    - -
    [docs] def eval(self): - """""" # TODO add evaluation function
    -
    - -
    -
    - -
    -
    -
    -
    - - - - \ No newline at end of file diff --git a/docs/build/html/_modules/aaanalysis/plotting/plotting_functions.html b/docs/build/html/_modules/aaanalysis/plotting/plotting_functions.html deleted file mode 100644 index 0ddfc6a9..00000000 --- a/docs/build/html/_modules/aaanalysis/plotting/plotting_functions.html +++ /dev/null @@ -1,556 +0,0 @@ - - - - - - aaanalysis.plotting.plotting_functions — AAanalysis - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - -
    - -
    -
    -
    -
      -
    • - - -
    • -
    • -
    -
    -
    -
    -
    - -

    Source code for aaanalysis.plotting.plotting_functions

    -#! /usr/bin/python3
    -"""
    -Default plotting functions
    -"""
    -import seaborn as sns
    -import matplotlib as mpl
    -import matplotlib.pyplot as plt
    -import aaanalysis.utils as ut
    -
    -
    -
    -LIST_AA_COLOR_PALETTES = ["FEAT", "SHAP", "GGPLOT"]
    -LIST_AA_COLOR_DICTS = ["DICT_SCALE_CAT", "DICT_COLOR"]
    -LIST_AA_COLORS = LIST_AA_COLOR_PALETTES + LIST_AA_COLOR_DICTS
    -
    -LIST_FONTS = ['Arial', 'Avant Garde', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'DejaVu Sans',
    -              'Geneva', 'Helvetica', 'Lucid', 'Lucida Grande', 'Verdana']
    -
    -
    -# Helper functions
    -def check_font_style(font="Arial"):
    -    """"""
    -    if font not in LIST_FONTS:
    -        error_message = f"'font' ({font}) not in recommended fonts: {LIST_FONTS}. Set font manually by:" \
    -                        f"\n\tplt.rcParams['font.sans-serif'] = '{font}'"
    -        raise ValueError(error_message)
    -
    -
    -def check_fig_format(fig_format="pdf"):
    -    """"""
    -    list_fig_formats = ['eps', 'jpg', 'jpeg', 'pdf', 'pgf', 'png', 'ps',
    -                        'raw', 'rgba', 'svg', 'svgz', 'tif', 'tiff', 'webp']
    -    ut.check_str(name="fig_format", val=fig_format)
    -    if fig_format not in list_fig_formats:
    -        raise ValueError(f"'fig_format' should be one of following: {list_fig_formats}")
    -
    -
    -def check_grid_axis(grid_axis="y"):
    -    list_grid_axis = ["y", "x", "both"]
    -    if grid_axis not in list_grid_axis:
    -        raise ValueError(f"'grid_axis' ({grid_axis}) should be one of following: {list_grid_axis}")
    -
    -
    -def check_cats(list_cat=None, dict_color=None, labels=None):
    -    """"""
    -    ut.check_dict(name="dict_color", val=dict_color, accept_none=False)
    -    if labels is not None:
    -        if list_cat is not None:
    -            if len(list_cat) != len(labels):
    -                raise ValueError(f"Length of 'list_cat' ({len(list_cat)}) and 'labels' ({len(labels)}) must match")
    -        elif len(dict_color) != len(labels):
    -            raise ValueError(f"Length of 'dict_color' ({len(dict_color)}) and 'labels' ({len(labels)}) must match")
    -    if list_cat is None:
    -        list_cat = list(dict_color.keys())
    -    else:
    -        raise ValueError("'list_cat' and 'dict_color' should not be None")
    -    return list_cat
    -
    -
    -# Get color maps
    -def _get_shap_cmap(n_colors=100, facecolor_dark=True):
    -    """Generate a diverging color map for feature values."""
    -    n = 20
    -    cmap_low = sns.light_palette(ut.COLOR_SHAP_NEG, input="hex", reverse=True, n_colors=int(n_colors/2)+n)
    -    cmap_high = sns.light_palette(ut.COLOR_SHAP_POS, input="hex", n_colors=int(n_colors/2)+n)
    -    c_middle = [(0, 0, 0)] if facecolor_dark else [cmap_low[-1]]
    -    cmap = cmap_low[0:-n] + c_middle + cmap_high[n:]
    -    return cmap
    -
    -
    -def _get_feat_cmap(n_colors=100, facecolor_dark=False):
    -    """Generate a diverging color map for feature values."""
    -    n = 5
    -    cmap = sns.color_palette("RdBu_r", n_colors=n_colors + n * 2)
    -    cmap_low, cmap_high = cmap[0:int((n_colors + n * 2) / 2)], cmap[int((n_colors + n * 2) / 2):]
    -    c_middle = [(0, 0, 0)] if facecolor_dark else [cmap_low[-1]]
    -    cmap = cmap_low[0:-n] + c_middle + cmap_high[n:]
    -    return cmap
    -
    -
    -def _get_ggplot_cmap(n_colors=100):
    -    """Generate a circular GGplot color palette."""
    -    cmap = sns.color_palette("husl", n_colors)
    -    return cmap
    -
    -
    -def _get_default_colors(name=None, n_colors=100, facecolor_dark=True):
    -    """Retrieve default color maps based on palette name."""
    -    args = dict(n_colors=n_colors, facecolor_dark=facecolor_dark)
    -    if name == "SHAP":
    -        return _get_shap_cmap(**args)
    -    elif name == "FEAT":
    -        return _get_feat_cmap(**args)
    -    elif name == "GGPLOT":
    -        return _get_ggplot_cmap(n_colors=n_colors)
    -
    -
    -def _get_cmap_with_gap(n_colors=100, color_pos=None, color_neg=None, color_center=None, pct_gap=10, pct_center=None,
    -                       input="hex"):
    -    """Generate a custom color map with a gap."""
    -    n_gap = int(n_colors*pct_gap/2)
    -    cmap_pos = sns.light_palette(color_pos, input=input, n_colors=int(n_colors/2)+n_gap)
    -    cmap_neg = sns.light_palette(color_neg, input=input, reverse=True, n_colors=int(n_colors/2)+n_gap)
    -    color_center = [cmap_neg[-1]] if color_center is None else color_center
    -    color_center = [color_center] if type(color_center) is str else color_center
    -    if pct_center is None:
    -        cmap = cmap_neg[0:-n_gap] + color_center + cmap_pos[n_gap:]
    -    else:
    -        n_center = int(n_colors * pct_center)
    -        n_gap += int(n_center/2)
    -        cmap = cmap_neg[0:-n_gap] + color_center * n_center + cmap_pos[n_gap:]
    -    return cmap
    -
    -
    -# Default plotting function
    -
    [docs]def plot_get_cmap(name=None, n_colors=100, facecolor_dark=False, - color_pos=None, color_neg=None, color_center=None, - input="hex", pct_gap=10, pct_center=None): - """ - Retrieve color maps or color dictionaries specified for AAanalysis. - - Parameters - ---------- - name : str, optional - The name of the color palette to use in AAanalysis. Options include: - - 'SHAP', 'FEAT', 'GGPLOT': Return color maps for SHAP plots, CPP feature maps/heatmaps, - and datagrouping as in GGplot, respectively. - - 'DICT_COLOR', 'DICT_SCALE_CAT': Return default color dictionaries for plots (e.g., bars in CPPPlot.profile) - and scale categories (e.g., CPPPlot.heatmap), respectively. - n_colors : int, default=100 - Number of colors in the color map. - facecolor_dark : bool, default=False - Whether to use a dark face color for 'SHAP' and 'FEAT'. - color_pos : str, optional - Hex code for the positive color. - color_neg : str, optional - Hex code for the negative color. - color_center : str or list, optional - Hex code or list for the center color. - input : str, {'rgb', 'hls', 'husl', 'xkcd'} - Color space to interpret the input color. The first three options - apply to tuple inputs and the latter applies to string inputs. - pct_gap : int, default=10 - Percentage size of the gap between color ranges. - pct_center : float, optional - Percentage size of the center color in the map. - - Returns - ------- - cmap : list or dict - If 'name' parameter is 'SHAP', 'FEAT', or 'GGPLOT', a list of colors specified for AAanalysis will be returned. - If 'name' parameter is None, a list of colors based on provided colors - - See Also - -------- - sns.color_palette : Function to generate a color palette in seaborn. - sns.light_palette : Function to generate a lighter color palette in seaborn. 
- """ - # TODO check color dict name - if name in LIST_AA_COLOR_PALETTES: - cmap = _get_default_colors(name=name, n_colors=n_colors, facecolor_dark=facecolor_dark) - return cmap - cmap = _get_cmap_with_gap(n_colors=n_colors, color_pos=color_pos, color_neg=color_neg, - color_center=color_center, pct_gap=pct_gap, pct_center=pct_center, - input=input) - return cmap
    - - -
    [docs]def plot_get_cdict(name=None): - """ - Retrieve color dictionaries specified for AAanalysis. - - Parameters - ---------- - name : str, {'DICT_COLOR', 'DICT_SCALE_CAT'} - The name of default color dictionaries for plots (e.g., bars in CPPPlot.profile) - and scale categories (e.g., CPPPlot.heatmap), respectively. - - Returns - ------- - cmap : dict - Specific AAanalysis color dictionary. - """ - # TODO check color dict name - color_dict = ut.DICT_COLOR if name == "DICT_COLORS" else ut.DICT_COLOR_CAT - return color_dict
    - - -
    [docs]def plot_settings(fig_format="pdf", verbose=False, grid=False, grid_axis="y", - font_scale=0.7, font="Arial", - change_size=True, weight_bold=True, adjust_elements=True, - short_ticks=False, no_ticks=False, - no_ticks_y=False, short_ticks_y=False, no_ticks_x=False, short_ticks_x=False): - """ - Configure general settings for plot visualization with various customization options. - - Parameters - ---------- - fig_format : str, default='pdf' - Specifies the file format for saving the plot. - verbose : bool, default=False - If True, enables verbose output. - grid : bool, default=False - If True, makes the grid visible. - grid_axis : str, default='y' - Choose the axis ('y', 'x', 'both') to apply the grid to. - font_scale : float, default=0.7 - Sets the scale for font sizes in the plot. - font : str, default='Arial' - Name of sans-serif font (e.g., 'Arial', 'Verdana', 'Helvetica', 'DejaVu Sans') - change_size : bool, default=True - If True, adjusts the size of plot elements. - weight_bold : bool, default=True - If True, text elements appear in bold. - adjust_elements : bool, default=True - If True, makes additional visual and layout adjustments to the plot. - short_ticks : bool, default=False - If True, uses short tick marks. - no_ticks : bool, default=False - If True, removes all tick marks. - no_ticks_y : bool, default=False - If True, removes tick marks on the y-axis. - short_ticks_y : bool, default=False - If True, uses short tick marks on the y-axis. - no_ticks_x : bool, default=False - If True, removes tick marks on the x-axis. - short_ticks_x : bool, default=False - If True, uses short tick marks on the x-axis. - - Notes - ----- - This function modifies the global settings of Matplotlib and Seaborn libraries. 
- - Examples - -------- - >>> import aaanalysis as aa - >>> aa.plot_settings(fig_format="pdf", font_scale=1.0, weight_bold=False) - """ - # Check input - check_fig_format(fig_format=fig_format) - check_font_style(font=font) - check_grid_axis(grid_axis=grid_axis) - args_bool = {"verbose": verbose, "grid": grid, "change_size": change_size, "weight_bold": weight_bold, - "adjust_elements": adjust_elements, - "short_ticks": short_ticks, "no_ticks": no_ticks, "no_ticks_y": no_ticks_y, - "short_ticks_y": short_ticks_y, "no_ticks_x": no_ticks_x, "short_ticks_x": short_ticks_x} - for key in args_bool: - ut.check_bool(name=key, val=args_bool[key]) - ut.check_non_negative_number(name="font_scale", val=font_scale, min_val=0, just_int=False) - - # Set embedded fonts in PDF - mpl.rcParams.update(mpl.rcParamsDefault) - mpl.rcParams["pdf.fonttype"] = 42 - mpl.rcParams["pdf.fonttype"] = 42 - if verbose: - print(plt.rcParams.keys) # Print all plot settings that can be modified in general - if not change_size: - plt.rcParams["font.family"] = "sans-serif" - plt.rcParams["font.sans-serif"] = font - mpl.rc('font', **{'family': font}) - return - sns.set_context("talk", font_scale=font_scale) # Font settings https://matplotlib.org/3.1.1/tutorials/text/text_props.html - plt.rcParams["font.family"] = "sans-serif" - plt.rcParams["font.sans-serif"] = font - if weight_bold: - plt.rcParams["axes.labelweight"] = "bold" - plt.rcParams["axes.titleweight"] = "bold" - else: - plt.rcParams["axes.linewidth"] = 1 - plt.rcParams["xtick.major.width"] = 0.8 - plt.rcParams["xtick.minor.width"] = 0.6 - plt.rcParams["ytick.major.width"] = 0.8 - plt.rcParams["ytick.minor.width"] = 0.6 - if short_ticks: - plt.rcParams["xtick.major.size"] = 3.5 - plt.rcParams["xtick.minor.size"] = 2 - plt.rcParams["ytick.major.size"] = 3.5 - plt.rcParams["ytick.minor.size"] = 2 - if short_ticks_x: - plt.rcParams["xtick.major.size"] = 3.5 - plt.rcParams["xtick.minor.size"] = 2 - if short_ticks_y: - 
plt.rcParams["ytick.major.size"] = 3.5 - plt.rcParams["ytick.minor.size"] = 2 - if no_ticks: - plt.rcParams["xtick.major.size"] = 0 - plt.rcParams["xtick.minor.size"] = 0 - plt.rcParams["ytick.major.size"] = 0 - plt.rcParams["ytick.minor.size"] = 0 - if no_ticks_x: - plt.rcParams["xtick.major.size"] = 0 - plt.rcParams["xtick.minor.size"] = 0 - if no_ticks_y: - plt.rcParams["ytick.major.size"] = 0 - plt.rcParams["ytick.minor.size"] = 0 - - plt.rcParams["axes.labelsize"] = 17 #13.5 - plt.rcParams["axes.titlesize"] = 16.5 #15 - if fig_format == "pdf": - mpl.rcParams['pdf.fonttype'] = 42 - elif "svg" in fig_format: - mpl.rcParams['svg.fonttype'] = 'none' - font = {'family': font, "weight": "bold"} if weight_bold else {"family": font} - mpl.rc('font', **font) - if adjust_elements: - # Error bars - plt.rcParams["errorbar.capsize"] = 10 # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.errorbar.html - # Grid - plt.rcParams["axes.grid.axis"] = grid_axis # 'y', 'x', 'both' - plt.rcParams["axes.grid"] = grid - # Legend - plt.rcParams["legend.frameon"] = False - plt.rcParams["legend.fontsize"] = "medium" #"x-small" - plt.rcParams["legend.loc"] = 'upper right' # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.legend.html
    - - -
    [docs]def plot_gcfs(): - """Get current font size, which is set by ut.plot_settings function""" - # Get the current plotting context - current_context = sns.plotting_context() - font_size = current_context['font.size'] - return font_size
    - - -
    [docs]def plot_set_legend(ax=None, handles=None, dict_color=None, list_cat=None, labels=None, y=-0.2, x=0.5, ncol=3, - fontsize=11, weight="normal", lw=0, edgecolor=None, return_handles=False, loc="upper left", - labelspacing=0.2, columnspacing=1, title=None, fontsize_legend=None, title_align_left=True, - fontsize_weight="normal", shape=None, **kwargs): - """ - Set a customizable legend for a plot. - - Parameters - ---------- - ax : matplotlib.axes.Axes, default=None - The axes to attach the legend to. - handles : list, default=None - Handles for legend items. - dict_color : dict, default=None - A dictionary mapping categories to colors. - list_cat : list, default=None - List of categories to include in the legend. - labels : list, default=None - Labels for legend items. - y : float, default=-0.2 - The y-coordinate for the legend's anchor point. - x : float, default=0.5 - The x-coordinate for the legend's anchor point. - ncol : int, default=3 - Number of columns in the legend. - fontsize : int, default=11 - Font size for the legend text. - weight : str, default='normal' - Weight of the font. - lw : float, default=0 - Line width for legend items. - edgecolor : color, default=None - Edge color for legend items. - return_handles : bool, default=False - Whether to return handles and labels. - loc : str, default='upper left' - Location for the legend. - labelspacing : float, default=0.2 - Vertical spacing between legend items. - columnspacing : int, default=1 - Horizontal spacing between legend columns. - title : str, default=None - Title for the legend. - fontsize_legend : int, default=None - Font size for the legend title. - title_align_left : bool, default=True - Whether to align the title to the left. - fontsize_weight : str, default='normal' - Font weight for the legend title. - shape : str, default=None - Marker shape for legend items. - **kwargs : dict - Additional arguments passed directly to ax.legend() for finer control. 
- - Returns - ------- - ax : matplotlib.axes.Axes - The axes with the legend applied. - - See Also - -------- - matplotlib.pyplot.legend : For additional details on how the 'loc' parameter can be customized. - matplotlib.lines.Line2D : For additional details on the different types of marker shapes ('shape' parameter). - - Examples - -------- - >>> import aaanalysis as aa - >>> aa.plot_set_legend(ax=ax, dict_color={'Cat1': 'red', 'Cat2': 'blue'}, shape='o') - """ - # Check input - if ax is None: - ax = plt.gca() - list_cat = check_cats(list_cat=list_cat, dict_color=dict_color, labels=labels) - args_float = {"y": y, "x": x, "lw": lw, "labelspacing": labelspacing, - "columnspacing": columnspacing} - for key in args_float: - ut.check_float(name=key, val=args_float[key]) - ut.check_non_negative_number(name="ncol", val=ncol, min_val=1, just_int=True, accept_none=False) - ut.check_non_negative_number(name="ncol", val=ncol, min_val=0, just_int=False, accept_none=True) - ut.check_bool(name="return_handles", val=return_handles) - ut.check_bool(name="title_align_left", val=title_align_left) - # TODO check other args - # Prepare the legend handles - dict_leg = {cat: dict_color[cat] for cat in list_cat} - # Generate function for legend markers based on provided shape - if shape is None: - if edgecolor is None: - f = lambda l, c: mpl.patches.Patch(facecolor=l, label=c, lw=lw, edgecolor=l) - else: - f = lambda l, c: mpl.patches.Patch(facecolor=l, label=c, lw=lw, edgecolor=edgecolor) - else: - f = lambda l, c: plt.Line2D([0], [0], marker=shape, color='w', markerfacecolor=l, markersize=10, label=c) - # Create handles if not provided - handles = [f(l, c) for c, l in dict_leg.items()] if handles is None else handles - # Return handles and labels if required - if return_handles: - return handles, labels - # Prepare labels and args - if labels is None: - labels = list(dict_leg.keys()) - args = dict(prop={"weight": weight, "size": fontsize}, **kwargs) - if fontsize_legend is not None: - 
args["title_fontproperties"] = {"weight": fontsize_weight, "size": fontsize_legend} - # Create the legend - legend = ax.legend(handles=handles, labels=labels, bbox_to_anchor=(x, y), ncol=ncol, loc=loc, - labelspacing=labelspacing, columnspacing=columnspacing, borderpad=0, **args, title=title) - # Align the title if required - if title_align_left: - legend._legend_box.align = "left" - return ax
    -
    - -
    -
    - -
    -
    -
    -
    - - - - \ No newline at end of file diff --git a/docs/build/html/_modules/index.html b/docs/build/html/_modules/index.html deleted file mode 100644 index 369b1ebf..00000000 --- a/docs/build/html/_modules/index.html +++ /dev/null @@ -1,128 +0,0 @@ - - - - - - Overview: module code — AAanalysis - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - -
    - -
    -
    -
    -
      -
    • - -
    • -
    • -
    -
    -
    - - -
    -
    -
    -
    - - - - \ No newline at end of file diff --git a/docs/build/html/_static/css/style.css b/docs/build/html/_static/css/style.css index b8bfd848..b6f89b7b 100644 --- a/docs/build/html/_static/css/style.css +++ b/docs/build/html/_static/css/style.css @@ -32,6 +32,13 @@ background: #f7f7f7; /* Sets the background color */ } +/* Style for inline code */ +.rst-content code { + background-color: #f5f5f5; /* Gray background */ + font-family: monospace; /* Monospace font */ + padding: 2px 2px; /* Padding around the text */ +} + /* Style for the search input box in the sidebar */ .wy-side-nav-search input[type=text] { border-color: #666666; /* Sets the border color */ @@ -67,4 +74,4 @@ html.writer-html4 .rst-content dl:not(.docutils)>dt, html.writer-html5 .rst-cont /* Style for vertical menu items */ .wy-menu-vertical a { color: #d9d9d9; /* Sets the text color */ -} +} \ No newline at end of file diff --git a/docs/build/html/api.html b/docs/build/html/api.html index fac04f60..e5cfd2c3 100644 --- a/docs/build/html/api.html +++ b/docs/build/html/api.html @@ -125,7 +125,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/generated/aaanalysis.AAclust.html b/docs/build/html/generated/aaanalysis.AAclust.html index a2a5bc31..51dce869 100644 --- a/docs/build/html/generated/aaanalysis.AAclust.html +++ b/docs/build/html/generated/aaanalysis.AAclust.html @@ -110,7 +110,7 @@
  • - View page source + Edit on GitHub

  • @@ -122,7 +122,7 @@

    aaanalysis.AAclust

    -class aaanalysis.AAclust(model=None, model_kwargs=None, verbose=False)[source][source]
    +class aaanalysis.AAclust(model=None, model_kwargs=None, verbose=False)[source]

    Bases: object

    AAclust: A k-optimized clustering framework for selecting redundancy-reduced set of numerical scales.

    AAclust is designed primarily for amino acid scales but is versatile enough for any set of numerical indices. @@ -218,7 +218,7 @@

    aaanalysis.AAclust
    -__init__(model=None, model_kwargs=None, verbose=False)[source][source]
    +__init__(model=None, model_kwargs=None, verbose=False)[source]

    Methods

    @@ -253,7 +253,7 @@

    aaanalysis.AAclust
    -fit(X, names=None, on_center=True, min_th=0, merge_metric='euclidean', n_clusters=None)[source][source]
    +fit(X, names=None, on_center=True, min_th=0, merge_metric='euclidean', n_clusters=None)[source]

    Fit the AAclust model on the data, optimizing cluster formation using Pearson correlation.

    AAclust determines the optimal number of clusters, k, without pre-specification. It partitions data(X) into clusters by maximizing the within-cluster Pearson correlation beyond the ‘min_th’ threshold. The quality of @@ -294,7 +294,7 @@

    aaanalysis.AAclust
    -cluster_naming(names=None, labels=None, name_unclassified='Unclassified')[source][source]
    +cluster_naming(names=None, labels=None, name_unclassified='Unclassified')[source]

    Assigns names to clusters based on scale names and their frequency.

    This method renames clusters based on the names of the scales in each cluster, with priority given to the most frequent scales. If the name is already used or does not exist, it defaults to ‘name_unclassified’.

    @@ -317,7 +317,7 @@

    aaanalysis.AAclust
    -static get_cluster_centers(X, labels=None)[source][source]
    +static get_cluster_centers(X, labels=None)[source]

    Computes the center of each cluster based on the given labels.

    Parameters
    @@ -338,7 +338,7 @@

    aaanalysis.AAclust
    -static get_cluster_medoids(X, labels=None)[source][source]
    +static get_cluster_medoids(X, labels=None)[source]

    Computes the medoid of each cluster based on the given labels.

    Parameters
    @@ -363,7 +363,7 @@

    aaanalysis.AAclust
    -static correlation(X_test, X_ref, labels_test=None, labels_ref=None, n=3, positive=True, on_center=False, except_unclassified=True)[source][source]
    +static correlation(X_test, X_ref, labels_test=None, labels_ref=None, n=3, positive=True, on_center=False, except_unclassified=True)[source]

    Computes the correlation of test data with reference cluster centers.

    Parameters
    @@ -389,7 +389,7 @@

    aaanalysis.AAclust
    -eval()[source][source]
    +eval()[source]

    diff --git a/docs/build/html/generated/aaanalysis.CPP.html b/docs/build/html/generated/aaanalysis.CPP.html index 95d79ce7..563d682a 100644 --- a/docs/build/html/generated/aaanalysis.CPP.html +++ b/docs/build/html/generated/aaanalysis.CPP.html @@ -110,7 +110,7 @@
  • - View page source + Edit on GitHub

  • @@ -122,7 +122,7 @@

    aaanalysis.CPP

    -class aaanalysis.CPP(df_scales=None, df_cat=None, df_parts=None, split_kws=None, accept_gaps=False, verbose=True)[source][source]
    +class aaanalysis.CPP(df_scales=None, df_cat=None, df_parts=None, split_kws=None, accept_gaps=False, verbose=True)[source]

    Bases: Tool

    Create and filter features that are most discriminant between two sets of sequences.

    @@ -143,7 +143,7 @@

    aaanalysis.CPP
    -__init__(df_scales=None, df_cat=None, df_parts=None, split_kws=None, accept_gaps=False, verbose=True)[source][source]
    +__init__(df_scales=None, df_cat=None, df_parts=None, split_kws=None, accept_gaps=False, verbose=True)[source]

    Methods

    @@ -166,7 +166,7 @@

    aaanalysis.CPP
    -run(labels=None, parametric=False, n_filter=100, tmd_len=20, jmd_n_len=10, jmd_c_len=10, ext_len=4, start=1, check_cat=True, n_pre_filter=None, pct_pre_filter=5, max_std_test=0.2, max_overlap=0.5, max_cor=0.5, n_processes=None)[source][source]
    +run(labels=None, parametric=False, n_filter=100, tmd_len=20, jmd_n_len=10, jmd_c_len=10, ext_len=4, start=1, check_cat=True, n_pre_filter=None, pct_pre_filter=5, max_std_test=0.2, max_overlap=0.5, max_cor=0.5, n_processes=None)[source]

    Perform CPP pipeline by creation and two-step filtering of features. CPP aims to identify a collection of non-redundant features that are most discriminant between a test and a reference group of sequences.

    @@ -221,7 +221,7 @@

    aaanalysis.CPP
    -static eval(df_feat=None, features=None)[source][source]
    +static eval(df_feat=None, features=None)[source]

    Get evaluation for provided dataset

    diff --git a/docs/build/html/generated/aaanalysis.CPPPlot.html b/docs/build/html/generated/aaanalysis.CPPPlot.html index db6286ab..9c4c9cf0 100644 --- a/docs/build/html/generated/aaanalysis.CPPPlot.html +++ b/docs/build/html/generated/aaanalysis.CPPPlot.html @@ -110,7 +110,7 @@
  • - View page source + Edit on GitHub

  • @@ -122,7 +122,7 @@

    aaanalysis.CPPPlot

    -class aaanalysis.CPPPlot(df_cat=None, accept_gaps=False, jmd_n_len=10, jmd_c_len=10, ext_len=4, verbose=True)[source][source]
    +class aaanalysis.CPPPlot(df_cat=None, accept_gaps=False, jmd_n_len=10, jmd_c_len=10, ext_len=4, verbose=True)[source]

    Bases: object

    Create and filter features that are most discriminant between two sets of sequences.

    @@ -139,7 +139,7 @@

    aaanalysis.CPPPlot
    -__init__(df_cat=None, accept_gaps=False, jmd_n_len=10, jmd_c_len=10, ext_len=4, verbose=True)[source][source]
    +__init__(df_cat=None, accept_gaps=False, jmd_n_len=10, jmd_c_len=10, ext_len=4, verbose=True)[source]

    Methods

    @@ -165,7 +165,7 @@

    aaanalysis.CPPPlot
    -profile(df_feat=None, y='category', val_col='mean_dif', val_type='count', normalize=False, figsize=(7, 5), title=None, title_kws=None, dict_color=None, edge_color='none', bar_width=0.75, add_jmd_tmd=True, tmd_len=20, start=1, jmd_n_seq=None, tmd_seq=None, jmd_c_seq=None, tmd_color='mediumspringgreen', jmd_color='blue', tmd_seq_color='black', jmd_seq_color='white', seq_size=None, tmd_jmd_fontsize=None, xtick_size=11.0, xtick_width=2.0, xtick_length=5.0, xticks_pos=False, ytick_size=None, ytick_width=2.0, ytick_length=5.0, ylim=None, highlight_tmd_area=True, highlight_alpha=0.15, grid=False, grid_axis='both', add_legend_cat=True, legend_kws=None, shap_plot=False, **kwargs)[source][source]
    +profile(df_feat=None, y='category', val_col='mean_dif', val_type='count', normalize=False, figsize=(7, 5), title=None, title_kws=None, dict_color=None, edge_color='none', bar_width=0.75, add_jmd_tmd=True, tmd_len=20, start=1, jmd_n_seq=None, tmd_seq=None, jmd_c_seq=None, tmd_color='mediumspringgreen', jmd_color='blue', tmd_seq_color='black', jmd_seq_color='white', seq_size=None, tmd_jmd_fontsize=None, xtick_size=11.0, xtick_width=2.0, xtick_length=5.0, xticks_pos=False, ytick_size=None, ytick_width=2.0, ytick_length=5.0, ylim=None, highlight_tmd_area=True, highlight_alpha=0.15, grid=False, grid_axis='both', add_legend_cat=True, legend_kws=None, shap_plot=False, **kwargs)[source]

    Plot feature profile for given features from ‘df_feat’.

    Parameters
    @@ -222,7 +222,7 @@

    aaanalysis.CPPPlot
    -heatmap(df_feat=None, y='subcategory', val_col='mean_dif', val_type='mean', normalize=False, figsize=(8, 5), title=None, title_kws=None, vmin=None, vmax=None, grid_on=True, cmap='RdBu_r', cmap_n_colors=None, dict_color=None, cbar_kws=None, facecolor_dark=False, add_jmd_tmd=True, tmd_len=20, start=1, jmd_n_seq=None, tmd_seq=None, jmd_c_seq=None, tmd_color='mediumspringgreen', jmd_color='blue', tmd_seq_color='black', jmd_seq_color='white', seq_size=None, tmd_jmd_fontsize=None, xticks_pos=False, xtick_size=11.0, xtick_width=2.0, xtick_length=5.0, ytick_size=None, add_legend_cat=True, legend_kws=None, add_importance_map=False, cbar_pct=False, **kwargs)[source][source]
    +heatmap(df_feat=None, y='subcategory', val_col='mean_dif', val_type='mean', normalize=False, figsize=(8, 5), title=None, title_kws=None, vmin=None, vmax=None, grid_on=True, cmap='RdBu_r', cmap_n_colors=None, dict_color=None, cbar_kws=None, facecolor_dark=False, add_jmd_tmd=True, tmd_len=20, start=1, jmd_n_seq=None, tmd_seq=None, jmd_c_seq=None, tmd_color='mediumspringgreen', jmd_color='blue', tmd_seq_color='black', jmd_seq_color='white', seq_size=None, tmd_jmd_fontsize=None, xticks_pos=False, xtick_size=11.0, xtick_width=2.0, xtick_length=5.0, ytick_size=None, add_legend_cat=True, legend_kws=None, add_importance_map=False, cbar_pct=False, **kwargs)[source]

    Plot a featuremap of the selected value column with scale information (y-axis) versus sequence position (x-axis).

    This is a wrapper function for seaborn.heatmap(), designed to highlight differences between two sets of sequences at the positional level (e.g., amino acid level for protein sequences).

    @@ -307,7 +307,7 @@

    aaanalysis.CPPPlot
    -update_seq_size()[source][source]
    +update_seq_size()[source]

    diff --git a/docs/build/html/generated/aaanalysis.SequenceFeature.html b/docs/build/html/generated/aaanalysis.SequenceFeature.html index aa9d3d69..b6c9cccb 100644 --- a/docs/build/html/generated/aaanalysis.SequenceFeature.html +++ b/docs/build/html/generated/aaanalysis.SequenceFeature.html @@ -110,7 +110,7 @@
  • - View page source + Edit on GitHub

  • @@ -122,7 +122,7 @@

    aaanalysis.SequenceFeature

    -class aaanalysis.SequenceFeature[source][source]
    +class aaanalysis.SequenceFeature[source]

    Bases: object

    Retrieve and create sequence feature components (Part, Split, and Scale).

    @@ -189,7 +189,7 @@

    aaanalysis.SequenceFeature
    -static get_df_parts(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None, ext_len=4, all_parts=False)[source][source]
    +static get_df_parts(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None, ext_len=4, all_parts=False)[source]

    Create DataFrane with sequence parts.

    Parameters
    @@ -235,7 +235,7 @@

    aaanalysis.SequenceFeature
    -static get_split_kws(n_split_min=1, n_split_max=15, steps_pattern=None, n_min=2, n_max=4, len_max=15, steps_periodicpattern=None, split_types=None)[source][source]
    +static get_split_kws(n_split_min=1, n_split_max=15, steps_pattern=None, n_min=2, n_max=4, len_max=15, steps_periodicpattern=None, split_types=None)[source]

    Create dictionary with kwargs for three split types: Segment, Pattern, PeriodicPattern

    Parameters
    @@ -282,7 +282,7 @@

    aaanalysis.SequenceFeature
    -get_features(list_parts=None, split_kws=None, df_scales=None, all_parts=False)[source][source]
    +get_features(list_parts=None, split_kws=None, df_scales=None, all_parts=False)[source]

    Create list of all feature ids for given Parts, Splits, and Scales

    Parameters
    @@ -304,7 +304,7 @@

    aaanalysis.SequenceFeature
    -static feat_matrix(features=None, df_parts=None, df_scales=None, accept_gaps=False, n_jobs=None, verbose=False, return_labels=False)[source][source]
    +static feat_matrix(features=None, df_parts=None, df_scales=None, accept_gaps=False, n_jobs=None, verbose=False, return_labels=False)[source]

    Create feature matrix for given feature ids and sequence parts.

    Parameters
    @@ -329,7 +329,7 @@

    aaanalysis.SequenceFeature
    -static feat_names(features=None, df_cat=None, tmd_len=20, jmd_c_len=10, jmd_n_len=10, ext_len=0, start=1)[source][source]
    +static feat_names(features=None, df_cat=None, tmd_len=20, jmd_c_len=10, jmd_n_len=10, ext_len=0, start=1)[source]

    Convert feature ids (PART-SPLIT-SCALE) into feature names (scale name [positions]).

    Parameters
    @@ -366,7 +366,7 @@

    aaanalysis.SequenceFeature
    -static add_feat_value(df_parts=None, split=None, dict_scale=None, accept_gaps=False)[source][source]
    +static add_feat_value(df_parts=None, split=None, dict_scale=None, accept_gaps=False)[source]

    Create feature values for all sequence parts by combining Part, Split, and Scale.

    Parameters
    @@ -415,7 +415,7 @@

    aaanalysis.SequenceFeature
    -static add_dif(df_feat=None, df_seq=None, labels=None, sample_name=<class 'str'>, ref_group=0, accept_gaps=False, jmd_n_len=10, jmd_c_len=10, df_parts=None, df_scales=None)[source][source]
    +static add_dif(df_feat=None, df_seq=None, labels=None, sample_name=<class 'str'>, ref_group=0, accept_gaps=False, jmd_n_len=10, jmd_c_len=10, df_parts=None, df_scales=None)[source]

    Add feature value difference between sample and reference group to DataFrame.

    Parameters
    @@ -439,7 +439,7 @@

    aaanalysis.SequenceFeature
    -static add_position(df_feat=None, features=None, start=1, tmd_len=20, jmd_n_len=10, jmd_c_len=10, ext_len=4, part_split=False)[source][source]
    +static add_position(df_feat=None, features=None, start=1, tmd_len=20, jmd_n_len=10, jmd_c_len=10, ext_len=4, part_split=False)[source]

    Create list with positions for given feature names

    Parameters
    diff --git a/docs/build/html/generated/aaanalysis.dPULearn.html b/docs/build/html/generated/aaanalysis.dPULearn.html index 1ea6e7d1..484b619d 100644 --- a/docs/build/html/generated/aaanalysis.dPULearn.html +++ b/docs/build/html/generated/aaanalysis.dPULearn.html @@ -107,7 +107,7 @@
  • - View page source + Edit on GitHub

  • @@ -119,7 +119,7 @@

    aaanalysis.dPULearn

    -class aaanalysis.dPULearn(verbose=False, n_components=0.8, pca_kwargs=None, metric=None)[source][source]
    +class aaanalysis.dPULearn(verbose=False, n_components=0.8, pca_kwargs=None, metric=None)[source]

    Bases: object

    Deterministic Positive-Unlabeled (dPULearn) model.

    dPULearn offers a deterministic approach for Positive-Unlabeled (PU) learning. The model primarily employs @@ -168,7 +168,7 @@

    aaanalysis.dPULearn
    -__init__(verbose=False, n_components=0.8, pca_kwargs=None, metric=None)[source][source]
    +__init__(verbose=False, n_components=0.8, pca_kwargs=None, metric=None)[source]

    Methods

    @@ -191,7 +191,7 @@

    aaanalysis.dPULearn
    -fit(X, labels=None, n_neg=0, label_pos=1, name_neg='REL_NEG', df_seq=None, col_class='class')[source][source]
    +fit(X, labels=None, n_neg=0, label_pos=1, name_neg='REL_NEG', df_seq=None, col_class='class')[source]

    Fit the dPULearn model to identify reliable negative samples from the provided feature matrix and labels.

    @@ -242,7 +242,7 @@

    aaanalysis.dPULearn
    -eval()[source][source]
    +eval()[source]

    diff --git a/docs/build/html/generated/aaanalysis.load_dataset.html b/docs/build/html/generated/aaanalysis.load_dataset.html index f0facf71..13c761c8 100644 --- a/docs/build/html/generated/aaanalysis.load_dataset.html +++ b/docs/build/html/generated/aaanalysis.load_dataset.html @@ -108,7 +108,7 @@
  • - View page source + Edit on GitHub

  • @@ -120,10 +120,10 @@

    aaanalysis.load_dataset

    -aaanalysis.load_dataset(name='INFO', n=None, non_canonical_aa='remove', min_len=None, max_len=None)[source][source]
    +aaanalysis.load_dataset(name='INFO', n=None, non_canonical_aa='remove', min_len=None, max_len=None)[source]

    Load protein benchmarking datasets.

    -

    The benchmarks are distinguished into amino acid (‘AA’), domain (‘DOM’), and sequence (‘SEQ’) level -datasets. Use default settings (name=’INFO’) of an overview table. Detailed analysis is in :cite:`Breimann23a`.

    +

    The benchmarks are categorized into amino acid (‘AA’), domain (‘DOM’), and sequence (‘SEQ’) level +datasets. Use default settings (name='INFO') for an overview table. Detailed analysis is in [Breimann23a].

    Parameters
      @@ -141,10 +141,10 @@

      aaanalysis.load_dataset

    Returns
    -

    Dataframe with the selected sequence dataset.

    +

    Dataframe (df_seq) containing the selected sequence dataset.

    Return type
    -

    df_seq

    +

    pd.DataFrame

    diff --git a/docs/build/html/generated/aaanalysis.load_scales.html b/docs/build/html/generated/aaanalysis.load_scales.html index 36cba888..38559bdf 100644 --- a/docs/build/html/generated/aaanalysis.load_scales.html +++ b/docs/build/html/generated/aaanalysis.load_scales.html @@ -108,7 +108,7 @@
  • - View page source + Edit on GitHub

  • @@ -120,7 +120,7 @@

    aaanalysis.load_scales

    -aaanalysis.load_scales(name='scales', just_aaindex=False, unclassified_in=True)[source][source]
    +aaanalysis.load_scales(name='scales', just_aaindex=False, unclassified_in=True)[source]

    Load amino acid scales, scale classification (AAontology), or scale evaluation.

    A thorough analysis of the residue and sequence datasets can be found in TODO[Breimann23a].

    diff --git a/docs/build/html/generated/aaanalysis.plot_gcfs.html b/docs/build/html/generated/aaanalysis.plot_gcfs.html index 3fb86912..97d7d229 100644 --- a/docs/build/html/generated/aaanalysis.plot_gcfs.html +++ b/docs/build/html/generated/aaanalysis.plot_gcfs.html @@ -111,7 +111,7 @@
  • - View page source + Edit on GitHub

  • @@ -123,7 +123,7 @@

    aaanalysis.plot_gcfs

    -aaanalysis.plot_gcfs()[source][source]
    +aaanalysis.plot_gcfs()[source]

    Get current font size, which is set by ut.plot_settings function

    diff --git a/docs/build/html/generated/aaanalysis.plot_get_cdict.html b/docs/build/html/generated/aaanalysis.plot_get_cdict.html index 383fcbe3..bfa23fd0 100644 --- a/docs/build/html/generated/aaanalysis.plot_get_cdict.html +++ b/docs/build/html/generated/aaanalysis.plot_get_cdict.html @@ -111,7 +111,7 @@
  • - View page source + Edit on GitHub

  • @@ -123,7 +123,7 @@

    aaanalysis.plot_get_cdict

    -aaanalysis.plot_get_cdict(name=None)[source][source]
    +aaanalysis.plot_get_cdict(name=None)[source]

    Retrieve color dictionaries specified for AAanalysis.

    Parameters
    diff --git a/docs/build/html/generated/aaanalysis.plot_get_cmap.html b/docs/build/html/generated/aaanalysis.plot_get_cmap.html index 53df4320..21b5fc87 100644 --- a/docs/build/html/generated/aaanalysis.plot_get_cmap.html +++ b/docs/build/html/generated/aaanalysis.plot_get_cmap.html @@ -111,7 +111,7 @@
  • - View page source + Edit on GitHub

  • @@ -123,7 +123,7 @@

    aaanalysis.plot_get_cmap

    -aaanalysis.plot_get_cmap(name=None, n_colors=100, facecolor_dark=False, color_pos=None, color_neg=None, color_center=None, input='hex', pct_gap=10, pct_center=None)[source][source]
    +aaanalysis.plot_get_cmap(name=None, n_colors=100, facecolor_dark=False, color_pos=None, color_neg=None, color_center=None, input='hex', pct_gap=10, pct_center=None)[source]

    Retrieve color maps or color dictionaries specified for AAanalysis.

    Parameters
    diff --git a/docs/build/html/generated/aaanalysis.plot_set_legend.html b/docs/build/html/generated/aaanalysis.plot_set_legend.html index 5eb952d8..a32cd14c 100644 --- a/docs/build/html/generated/aaanalysis.plot_set_legend.html +++ b/docs/build/html/generated/aaanalysis.plot_set_legend.html @@ -111,7 +111,7 @@
  • - View page source + Edit on GitHub

  • @@ -123,7 +123,7 @@

    aaanalysis.plot_set_legend

    -aaanalysis.plot_set_legend(ax=None, handles=None, dict_color=None, list_cat=None, labels=None, y=-0.2, x=0.5, ncol=3, fontsize=11, weight='normal', lw=0, edgecolor=None, return_handles=False, loc='upper left', labelspacing=0.2, columnspacing=1, title=None, fontsize_legend=None, title_align_left=True, fontsize_weight='normal', shape=None, **kwargs)[source][source]
    +aaanalysis.plot_set_legend(ax=None, handles=None, dict_color=None, list_cat=None, labels=None, y=-0.2, x=0.5, ncol=3, fontsize=11, weight='normal', lw=0, edgecolor=None, return_handles=False, loc='upper left', labelspacing=0.2, columnspacing=1, title=None, fontsize_legend=None, title_align_left=True, fontsize_weight='normal', shape=None, **kwargs)[source]

    Set a customizable legend for a plot.

    Parameters
    diff --git a/docs/build/html/generated/aaanalysis.plot_settings.html b/docs/build/html/generated/aaanalysis.plot_settings.html index fa5a60f3..8d2b600b 100644 --- a/docs/build/html/generated/aaanalysis.plot_settings.html +++ b/docs/build/html/generated/aaanalysis.plot_settings.html @@ -111,7 +111,7 @@
  • - View page source + Edit on GitHub

  • @@ -123,7 +123,7 @@

    aaanalysis.plot_settings

    -aaanalysis.plot_settings(fig_format='pdf', verbose=False, grid=False, grid_axis='y', font_scale=0.7, font='Arial', change_size=True, weight_bold=True, adjust_elements=True, short_ticks=False, no_ticks=False, no_ticks_y=False, short_ticks_y=False, no_ticks_x=False, short_ticks_x=False)[source][source]
    +aaanalysis.plot_settings(fig_format='pdf', verbose=False, grid=False, grid_axis='y', font_scale=0.7, font='Arial', change_size=True, weight_bold=True, adjust_elements=True, short_ticks=False, no_ticks=False, no_ticks_y=False, short_ticks_y=False, no_ticks_x=False, short_ticks_x=False)[source]

    Configure general settings for plot visualization with various customization options.

    Parameters
    diff --git a/docs/build/html/genindex.html b/docs/build/html/genindex.html index a969bf1e..7310caee 100644 --- a/docs/build/html/genindex.html +++ b/docs/build/html/genindex.html @@ -84,6 +84,7 @@
  • + Edit on GitHub

  • diff --git a/docs/build/html/index.html b/docs/build/html/index.html index 5e8dfb81..ee496608 100644 --- a/docs/build/html/index.html +++ b/docs/build/html/index.html @@ -96,7 +96,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/index/CONTRIBUTING_COPY.html b/docs/build/html/index/CONTRIBUTING_COPY.html index b9323b76..352b8451 100644 --- a/docs/build/html/index/CONTRIBUTING_COPY.html +++ b/docs/build/html/index/CONTRIBUTING_COPY.html @@ -123,7 +123,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/index/citations.html b/docs/build/html/index/citations.html index 6089e459..7d784b94 100644 --- a/docs/build/html/index/citations.html +++ b/docs/build/html/index/citations.html @@ -95,7 +95,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/index/introduction.html b/docs/build/html/index/introduction.html index 604e8200..ac26421c 100644 --- a/docs/build/html/index/introduction.html +++ b/docs/build/html/index/introduction.html @@ -100,7 +100,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/index/references.html b/docs/build/html/index/references.html index fd44f572..f9037e9d 100644 --- a/docs/build/html/index/references.html +++ b/docs/build/html/index/references.html @@ -102,7 +102,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/index/tables_template.html b/docs/build/html/index/tables_template.html index 84299ac3..641ebde2 100644 --- a/docs/build/html/index/tables_template.html +++ b/docs/build/html/index/tables_template.html @@ -95,7 +95,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/index/usage_principles.html b/docs/build/html/index/usage_principles.html index 95eaee2c..18b57c35 100644 --- a/docs/build/html/index/usage_principles.html +++ b/docs/build/html/index/usage_principles.html @@ -97,7 +97,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/build/html/searchindex.js b/docs/build/html/searchindex.js index c01f366a..789551af 100644 --- a/docs/build/html/searchindex.js +++ b/docs/build/html/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["_index/badges", "_index/overview", "_index/tables", "_index/usage_principles/data_loading", "api", "generated/aaanalysis.AAclust", "generated/aaanalysis.CPP", "generated/aaanalysis.CPPPlot", "generated/aaanalysis.SequenceFeature", "generated/aaanalysis.dPULearn", "generated/aaanalysis.load_dataset", "generated/aaanalysis.load_scales", "generated/aaanalysis.plot_gcfs", "generated/aaanalysis.plot_get_cdict", "generated/aaanalysis.plot_get_cmap", "generated/aaanalysis.plot_set_legend", "generated/aaanalysis.plot_settings", "index", "index/CONTRIBUTING_COPY", "index/citations", "index/introduction", "index/references", "index/tables_template", "index/usage_principles", "tutorials"], "filenames": ["_index/badges.rst", "_index/overview.rst", "_index/tables.rst", "_index/usage_principles/data_loading.rst", "api.rst", "generated/aaanalysis.AAclust.rst", "generated/aaanalysis.CPP.rst", "generated/aaanalysis.CPPPlot.rst", "generated/aaanalysis.SequenceFeature.rst", "generated/aaanalysis.dPULearn.rst", "generated/aaanalysis.load_dataset.rst", "generated/aaanalysis.load_scales.rst", "generated/aaanalysis.plot_gcfs.rst", "generated/aaanalysis.plot_get_cdict.rst", "generated/aaanalysis.plot_get_cmap.rst", "generated/aaanalysis.plot_set_legend.rst", "generated/aaanalysis.plot_settings.rst", "index.rst", "index/CONTRIBUTING_COPY.rst", "index/citations.rst", "index/introduction.rst", "index/references.rst", "index/tables_template.rst", "index/usage_principles.rst", "tutorials.rst"], "titles": ["<no title>", "<no title>", "Tables", "Data Loading", "API", "aaanalysis.AAclust", "aaanalysis.CPP", "aaanalysis.CPPPlot", "aaanalysis.SequenceFeature", "aaanalysis.dPULearn", "aaanalysis.load_dataset", "aaanalysis.load_scales", "aaanalysis.plot_gcfs", 
"aaanalysis.plot_get_cdict", "aaanalysis.plot_get_cmap", "aaanalysis.plot_set_legend", "aaanalysis.plot_settings", "Welcome to the AAanalysis documentation", "Contributing", "<no title>", "Introduction", "References", "Tables", "Usage Principles", "Tutorials"], "terms": {"aaanalysi": [1, 4, 18, 19, 20, 22, 23], "amino": [1, 3, 5, 6, 7, 8, 10, 11, 17, 19, 20, 21, 22], "acid": [1, 3, 5, 6, 7, 8, 10, 11, 17, 19, 20, 21, 22], "analysi": [1, 9, 10, 11, 17, 18, 20], "i": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 17, 18, 20], "python": [1, 17, 18, 20], "framework": [1, 5, 17, 20], "interpret": [1, 14, 17, 18, 19, 20, 21], "sequenc": [1, 2, 3, 6, 7, 8, 9, 10, 11, 17, 18, 20, 21], "base": [1, 5, 6, 7, 8, 9, 14, 17, 18, 20, 21], "protein": [1, 7, 8, 10, 17, 18, 20, 21], "predict": [1, 2, 3, 17, 18, 20, 21], "provid": [1, 3, 5, 6, 7, 9, 11, 14, 17, 18], "follow": [1, 3, 4, 5, 6, 8, 9, 17, 18, 19, 20], "algorithm": [1, 6, 7, 17, 18, 20], "aaclust": [1, 17, 19, 20, 21, 24], "k": [1, 5, 17, 20, 21], "optim": [1, 5, 6, 7, 17, 20, 21], "cluster": [1, 5, 17, 20, 21], "wrapper": [1, 7, 17, 18, 20], "select": [1, 5, 6, 7, 10, 11, 17, 20, 21], "redund": [1, 5, 6, 17, 20, 21], "reduc": [1, 5, 9, 17, 20, 21], "set": [1, 2, 5, 6, 7, 8, 9, 10, 12, 15, 16, 17, 18, 20, 21], "numer": [1, 5, 7, 8, 17, 20], "scale": [1, 5, 6, 7, 8, 11, 13, 14, 16, 17, 19, 20, 21, 22], "e": [1, 3, 7, 8, 13, 14, 16, 17, 18, 20], "g": [1, 3, 7, 8, 13, 14, 16, 17, 18, 20], "cpp": [1, 7, 8, 14, 17, 19, 20, 24], "compar": [1, 17, 20], "physicochem": [1, 6, 8, 17, 20, 21], "profil": [1, 7, 13, 14, 17, 20], "featur": [1, 5, 6, 7, 8, 9, 14, 17, 18, 20], "engin": [1, 17, 18, 20], "two": [1, 6, 7, 17, 18, 20, 21], "identifi": [1, 6, 7, 9, 17, 20, 21], "most": [1, 5, 6, 7, 9, 17, 20], "distinct": [1, 8, 17, 20], "dpulearn": [1, 17, 19, 20, 24], "determinist": [1, 9, 17, 20], "posit": [1, 2, 3, 5, 6, 7, 8, 9, 14, 17, 20], "unlabel": [1, 3, 9, 17, 20], "pu": [1, 2, 3, 9, 17, 20], "learn": [1, 3, 5, 9, 17, 18, 19, 20, 21], 
"enabl": [1, 5, 6, 7, 8, 9, 16, 17, 18, 20], "train": [1, 17, 18, 20], "unbalanc": [1, 17, 18, 20], "small": [1, 9, 17, 18, 20], "dataset": [1, 3, 5, 6, 10, 11, 17, 18, 20], "moreov": [1, 17], "function": [1, 4, 7, 12, 14, 16, 17], "load": [1, 10, 11, 17, 18], "benchmark": [1, 3, 10, 17], "load_data": [1, 17], "load_scal": [1, 2, 8, 17, 22], "depth": [1, 17], "level": [1, 2, 7, 10, 17], "classif": [1, 2, 3, 7, 11, 17], "aaontologi": [1, 2, 11, 17, 19, 21], "descript": [2, 18, 22], "see": [2, 3, 7, 10, 18, 22], "also": [2, 3, 9, 18, 22], "1_overview_benchmark": 2, "aa": [2, 3, 4, 6, 8, 9, 10, 15, 16, 22, 23], "load_dataset": [2, 4, 8, 22], "2_overview_scal": 2, "neg": [2, 5, 8, 9, 14], "predictor": 2, "refer": [2, 4, 5, 6, 8], "label": [2, 5, 6, 7, 8, 9, 15, 18], "aa_caspase3": 2, "233": 2, "185605": 2, "705": 2, "184900": 2, "prosper": [2, 21], "caspas": 2, "3": [2, 5, 8, 9, 15, 18], "cleavag": [2, 21], "site": [2, 21], "song18": [2, 21], "1": [2, 3, 5, 6, 7, 8, 9, 15, 16], "adjac": 2, "0": [2, 5, 6, 7, 8, 9, 15, 16], "aa_furin": 2, "71": 2, "59003": 2, "163": 2, "58840": 2, "furin": 2, "aa_ldr": [2, 3], "342": 2, "118248": 2, "35469": 2, "82779": 2, "idp": [2, 21], "seq2seq": [2, 21], "long": 2, "intrins": [2, 21], "disord": [2, 21], "region": [2, 21], "ldr": 2, "tang20": [2, 21], "order": [2, 5, 22], "aa_mmp2": 2, "573": 2, "312976": 2, "2416": 2, "310560": 2, "matrix": [2, 5, 8, 9], "metallopeptidas": 2, "2": [2, 3, 5, 6, 7, 8, 9, 15], "mmp2": 2, "aa_rnabind": 2, "221": 2, "55001": 2, "6492": 2, "48509": 2, "gmksvm": 2, "ru": 2, "rna": [2, 21], "bind": [2, 21], "residu": [2, 3, 11, 21], "rbp60": 2, "yang21": [2, 21], "non": [2, 6, 8, 10], "aa_sa": 2, "101082": 2, "84523": 2, "solvent": 2, "access": [2, 4], "sa": 2, "data": [2, 5, 7, 9, 17, 18], "expos": 2, "buri": 2, "seq_amylo": [2, 3], "1414": 2, "8484": 2, "511": 2, "903": 2, "rerf": [2, 21], "pred": [2, 21], "amyloidognen": 2, "teng21": [2, 21], "amyloidogen": [2, 21], "seq_capsid": 2, "7935": 2, "3364680": 
2, "3864": 2, "4071": 2, "viralpro": [2, 21], "capdsid": 2, "galiez16": [2, 21], "capsid": [2, 21], "seq_disulfid": 2, "2547": 2, "614470": 2, "897": 2, "1650": 2, "dipro": 2, "disulfid": 2, "bridg": [2, 21], "cheng06": [2, 21], "ss": 2, "bond": 2, "without": [2, 5, 7, 18], "seq_loc": 2, "1835": 2, "732398": 2, "1045": 2, "790": 2, "nan": 2, "subcellular": [2, 21], "locat": [2, 15], "cytoplasm": 2, "v": 2, "plasma": 2, "membran": [2, 8], "shen19": [2, 21], "seq_solubl": 2, "17408": 2, "4432269": 2, "8704": 2, "solpro": [2, 21], "solubl": [2, 21], "insolubl": 2, "magnan09": [2, 21], "seq_tail": 2, "6668": 2, "2671690": 2, "2574": 2, "4094": 2, "tail": [2, 21], "domain": [2, 3, 8, 10], "dom_gsec": [2, 3], "126": 2, "92964": 2, "63": 2, "gamma": 2, "secretas": [2, 21], "substrat": [2, 21], "breimann23c": [2, 21], "dom_gsec_pu": [2, 3], "694": 2, "494524": 2, "unknown": 2, "statu": 2, "min": 2, "max": 2, "normal": [2, 7, 15], "586": 2, "breimann23b": [2, 17, 19, 21], "scales_raw": [2, 11], "raw": 2, "valu": [2, 5, 6, 7, 8, 18, 20], "kawashima08": [2, 21], "scales_classif": 2, "scales_pc": [2, 11], "princip": [2, 9], "compon": [2, 8, 9], "pc": [2, 9], "compress": 2, "20": [2, 6, 7, 8, 18], "breimann23a": [2, 10, 11, 21], "top60": [2, 11], "top": [2, 5], "60": 2, "subset": [2, 8], "top60_ev": [2, 11], "evalu": [2, 6, 11, 18], "three": [3, 5, 8, 14], "type": [3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 18], "ar": [3, 6, 7, 8, 9, 10, 11, 18, 22], "us": [3, 5, 6, 7, 9, 10, 14, 16, 17, 18, 19, 20], "specif": [3, 5, 13], "properti": [3, 8], "dom": [3, 10], "seq": [3, 10], "The": [3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 18], "each": [3, 5, 6, 7, 8, 9], "indic": [3, 5, 7, 8, 9], "first": [3, 6, 7, 8, 14, 18], "part": [3, 6, 7, 8, 18], "name": [3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16], "an": [3, 9, 10, 17, 18, 19, 21], "abbrevi": 3, "For": [3, 4, 5, 8, 15, 18], "some": 3, "addit": [3, 7, 8, 9, 15, 16], "version": 3, "contain": [3, 4, 6, 7, 9, 10, 18], "onli": [3, 7, 11, 18], "sampl": [3, 
5, 6, 7, 8, 9], "dataset_nam": 3, "_pu": 3, "thi": [4, 5, 7, 16, 18], "page": [4, 17], "public": [4, 17, 18, 19], "object": [4, 5, 7, 8, 9], "more": [4, 7, 18], "exampl": [4, 18], "practic": 4, "usag": [4, 17], "our": [4, 18], "notebook": [4, 24], "conveni": 4, "common": [4, 18], "import": [4, 8, 9, 15, 16, 18, 23], "modul": [4, 5, 17], "Then": 4, "you": [4, 17, 18, 19], "can": [4, 5, 8, 9, 11, 15, 17, 18, 20], "all": [4, 5, 6, 7, 8, 16, 18, 22], "method": [4, 5, 6, 7, 8, 9, 21], "via": [4, 18, 21], "alia": [4, 8], "class": [5, 6, 7, 8, 9, 10], "model": [5, 9, 18], "none": [5, 6, 7, 8, 9, 10, 13, 14, 15], "model_kwarg": 5, "verbos": [5, 6, 7, 8, 9, 16], "fals": [5, 6, 7, 8, 9, 11, 14, 15, 16], "sourc": [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18], "A": [5, 8, 11, 15, 18, 20, 21], "design": [5, 7, 18], "primarili": [5, 9, 18], "versatil": 5, "enough": 5, "ani": [5, 18, 20], "It": [5, 20], "take": 5, "requir": 5, "pre": [5, 6, 18], "defin": [5, 8, 18], "number": [5, 6, 7, 8, 9, 10, 14, 15], "from": [5, 6, 7, 8, 9, 11, 17, 18, 22], "scikit": [5, 18], "http": [5, 18], "org": [5, 18], "stabl": 5, "html": [5, 18], "By": 5, "leverag": 5, "pearson": [5, 6], "correl": [5, 6], "similar": 5, "measur": [5, 18], "one": [5, 7], "repres": [5, 7, 20], "term": 5, "medoid": 5, "which": [5, 7, 8, 12, 20], "closest": 5, "": [5, 15, 21], "center": [5, 14], "yield": 5, "paramet": [5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16], "callabl": 5, "option": [5, 6, 7, 8, 9, 10, 11, 14, 16], "default": [5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16], "sklearn": 5, "kmean": 5, "emploi": [5, 9], "given": [5, 7, 8, 11, 22], "n_cluster": 5, "dict": [5, 6, 7, 8, 9, 13, 14, 15], "dictionari": [5, 6, 7, 8, 13, 14, 15], "keyword": [5, 7, 9], "argument": [5, 7, 8, 9, 15], "pass": [5, 7, 9, 15, 18], "bool": [5, 6, 7, 8, 9, 11, 14, 15, 16], "flag": 5, "disabl": [5, 10], "output": [5, 8, 9, 16], "obtain": [5, 8], "int": [5, 6, 7, 8, 9, 10, 14, 15], "labels_": [5, 9], "arrai": [5, 6, 8, 9], "like": [5, 6, 8, 9, 18], 
"centers_": 5, "averag": [5, 8], "correspond": [5, 18], "center_labels_": 5, "medoids_": 5, "medoid_labels_": 5, "medoid_ind_": 5, "chosen": [5, 6, 8], "within": [5, 6, 8], "origin": 5, "__init__": [5, 6, 7, 8, 9], "fit": [5, 9, 18], "x": [5, 7, 9, 10, 15, 16], "on_cent": 5, "true": [5, 6, 7, 8, 11, 15, 16], "min_th": 5, "merge_metr": 5, "euclidean": [5, 9], "format": [5, 16], "determin": 5, "partit": 5, "maxim": 5, "beyond": 5, "threshold": [5, 6], "qualiti": 5, "either": [5, 8, 17], "minimum": [5, 8, 10], "member": 5, "min_cor": 5, "between": [5, 6, 7, 8, 14, 15], "its": [5, 18], "govern": 5, "undergo": 5, "stage": 5, "estim": 5, "lower": 5, "bound": 5, "refin": 5, "metric": [5, 9, 18], "merg": 5, "smaller": 5, "direct": 5, "final": 5, "reduct": 5, "shape": [5, 6, 7, 8, 9, 15], "n_sampl": [5, 6, 8, 9], "n_featur": [5, 6, 7, 8, 9], "where": [5, 8, 9], "list": [5, 7, 8, 14, 15], "str": [5, 7, 8, 9, 10, 11, 13, 14, 15, 16], "If": [5, 6, 7, 8, 9, 10, 14, 16, 17, 18, 19], "return": [5, 6, 7, 8, 9, 10, 11, 13, 14, 15], "appli": [5, 9, 14, 15, 16], "otherwis": [5, 7, 8, 9], "float": [5, 6, 7, 9, 14, 15, 16], "instead": 5, "names_medoid": 5, "attribut": 5, "attr": 5, "further": [5, 7, 10, 18], "inform": [5, 6, 7, 8, 9, 10], "paper": 5, "todo": [5, 6, 11], "add": [5, 6, 7, 8], "link": [5, 6, 17, 18, 19, 21], "cluster_nam": 5, "name_unclassifi": 5, "unclassifi": [5, 11], "assign": [5, 7, 8, 9], "frequenc": 5, "renam": 5, "prioriti": 5, "frequent": 5, "alreadi": 5, "doe": 5, "exist": [5, 18], "cannot": 5, "classifi": [5, 7], "static": [5, 6, 8], "get_cluster_cent": 5, "comput": [5, 6, 7, 8, 18, 21], "center_label": 5, "associ": 5, "get_cluster_medoid": 5, "medoid_label": 5, "medoid_ind": 5, "index": [5, 17, 18, 21], "x_test": 5, "x_ref": 5, "labels_test": 5, "labels_ref": 5, "n": [5, 6, 7, 8, 10, 18, 21], "except_unclassifi": 5, "test": [5, 6], "consid": [5, 18], "strength": 5, "els": 5, "exclud": 5, "list_top_center_name_corr": 5, "have": [5, 18], "strongest": 5, "eval": 
[5, 6, 9, 18], "df_scale": [6, 8], "df_cat": [6, 7, 8], "df_part": [6, 8], "split_kw": [6, 8], "accept_gap": [6, 7, 8], "tool": [6, 18, 21], "creat": [6, 7, 8, 9, 18], "filter": [6, 7, 10], "discrimin": [6, 7], "panda": [6, 7, 8, 9, 11, 18], "datafram": [6, 7, 8, 9, 10, 11, 18], "load_categori": [6, 8], "categori": [6, 7, 8, 11, 13, 14, 15], "sequencefeatur": 6, "get_split_kw": [6, 8], "nest": [6, 8], "split_typ": [6, 8], "whether": [6, 7, 8, 11, 14, 15], "accept": [6, 7, 8], "miss": [6, 7, 8], "omit": [6, 7, 8], "print": [6, 7, 8], "progress": [6, 7, 21], "about": [6, 7], "run": [6, 8], "perform": [6, 9], "step": [6, 7, 8, 20], "parametr": 6, "n_filter": 6, "100": [6, 14], "tmd_len": [6, 7, 8], "jmd_n_len": [6, 7, 8], "10": [6, 7, 8, 14], "jmd_c_len": [6, 7, 8], "ext_len": [6, 7, 8], "4": [6, 7, 8], "start": [6, 7, 8, 18], "check_cat": 6, "n_pre_filt": 6, "pct_pre_filt": 6, "5": [6, 7, 8, 9, 15], "max_std_test": 6, "max_overlap": 6, "max_cor": 6, "n_process": 6, "pipelin": [6, 18], "creation": 6, "aim": [6, 7, 18], "collect": 6, "group": [6, 7, 8], "t": 6, "u": [6, 17, 18], "p": [6, 21], "percentag": [6, 9, 14], "length": [6, 7, 8, 10], "tmd": [6, 7, 8], "explan": [6, 7], "terminu": [6, 7, 8], "jmd": [6, 7, 8], "c": [6, 7, 8, 17, 21], "extend": [6, 7, 8, 18], "termin": [6, 7, 8], "should": [6, 7, 8, 9, 11, 18], "longer": 6, "than": 6, "check": [6, 18], "remain": [6, 18], "after": 6, "maximum": [6, 8, 9, 10], "standard": 6, "deviat": 6, "overlap": 6, "cpu": 6, "multiprocess": 6, "automat": [6, 7, 9, 18], "df_feat": [6, 7, 8], "uniqu": [6, 7], "statist": [6, 7], "n_feature_inform": [6, 7], "eleven": 6, "column": [6, 7, 8, 9, 10, 15, 18], "includ": [6, 8, 11, 14, 15, 18], "id": [6, 8], "result": 6, "rank": 6, "11": [6, 7, 15], "split": [6, 8], "subcategori": [6, 7], "sub": 6, "scale_nam": [6, 7], "abs_auc": [6, 7], "absolut": 6, "adjust": [6, 7, 16], "auc": 6, "abs_mean_dif": 6, "mean": [6, 7], "differ": [6, 7, 8, 15], "std_test": [6, 7], "std_ref": 6, "p_val": 6, 
"mann_whitnei": 6, "ttest_indep": 6, "p_val_fdr_bh": 6, "benjamini": 6, "hochberg": 6, "fdr": 6, "correct": 6, "get": [6, 8, 12], "condit": [7, 8], "jmd_m_len": [7, 8], "y": [7, 15, 16], "val_col": 7, "mean_dif": 7, "val_typ": 7, "count": [7, 11], "figsiz": 7, "7": [7, 8, 9, 16], "titl": [7, 15], "title_kw": 7, "dict_color": [7, 13, 14, 15], "edge_color": 7, "bar_width": 7, "75": 7, "add_jmd_tmd": 7, "jmd_n_seq": 7, "tmd_seq": 7, "jmd_c_seq": 7, "tmd_color": 7, "mediumspringgreen": 7, "jmd_color": 7, "blue": [7, 15], "tmd_seq_color": 7, "black": [7, 18], "jmd_seq_color": 7, "white": 7, "seq_siz": 7, "tmd_jmd_fontsiz": 7, "xtick_siz": 7, "xtick_width": 7, "xtick_length": 7, "xticks_po": 7, "ytick_siz": 7, "ytick_width": 7, "ytick_length": 7, "ylim": 7, "highlight_tmd_area": 7, "highlight_alpha": 7, "15": [7, 8], "grid": [7, 16], "grid_axi": [7, 16], "both": [7, 16], "add_legend_cat": 7, "legend_kw": 7, "shap_plot": 7, "kwarg": [7, 8, 15], "plot": [7, 13, 14, 15, 16, 17, 18], "instanc": 7, "avail": [7, 17, 19, 21], "specifi": [7, 8, 9, 13, 14, 16, 18], "check_value_typ": 7, "tupl": [7, 14], "size": [7, 8, 12, 14, 15, 16], "custom": [7, 15, 16], "appear": [7, 16], "map": [7, 8, 14, 15], "color": [7, 13, 14, 15], "edg": [7, 15, 18], "bar": [7, 13, 14], "width": [7, 15], "line": [7, 15], "annot": 7, "font": [7, 12, 15, 16], "tick": [7, 16], "axi": [7, 16], "limit": 7, "highlight": 7, "area": 7, "alpha": 7, "ad": 7, "drawn": 7, "legend": [7, 15], "shap": [7, 14, 18], "shaplei": 7, "gener": [7, 8, 14, 16, 18, 20, 21], "other": [7, 11, 18], "intern": 7, "librari": [7, 16, 18], "ax": [7, 15], "matplotlib": [7, 15, 16, 18], "heatmap": [7, 13, 14], "8": [7, 8, 9, 18], "vmin": 7, "vmax": 7, "grid_on": 7, "cmap": [7, 13, 14], "rdbu_r": 7, "cmap_n_color": 7, "cbar_kw": 7, "facecolor_dark": [7, 14], "add_importance_map": 7, "cbar_pct": 7, "featuremap": 7, "versu": 7, "seaborn": [7, 14, 16, 18], "shown": 7, "feat_impact": 7, "displai": 7, "sum": 7, "std": 7, "aggreg": 7, 
"positions_onli": 7, "across": 7, "recommend": [7, 9, 18], "when": [7, 9], "emphas": [7, 18], "fewer": 7, "value_typ": 7, "height": 7, "figur": 7, "inch": 7, "pyplot": [7, 15], "anchor": [7, 15], "colormap": 7, "infer": [7, 18], "seismic": 7, "space": [7, 9, 14, 15], "impact": 7, "discret": 7, "diverg": 7, "sequenti": 7, "kei": [7, 18], "colorbar": 7, "under": [7, 18], "depicet": 7, "depict": 7, "jmd_n": [7, 8], "jmd_c": [7, 8], "point": [7, 15], "set_xticklabel": 7, "widht": 7, "tick_param": 7, "pcolormesh": 7, "effect": [7, 18], "align": [7, 15], "applic": 7, "document": [7, 22], "detail": [7, 10, 15, 17, 18, 19], "code": [7, 14], "update_seq_s": 7, "retriev": [8, 13, 14], "continu": 8, "transmembran": 8, "principl": [8, 17], "segment": 8, "pattern": 8, "express": 8, "present": 8, "realiz": 8, "over": 8, "valid": [8, 18], "tmd_e": 8, "tmd_n": 8, "tmd_c": 8, "ext_c": 8, "ext_n": 8, "tmd_jmd": 8, "jmd_n_tmd_n": 8, "tmd_c_jmd_c": 8, "ext_n_tmd_n": 8, "tmd_c_ext_c": 8, "get_df_part": 8, "df_seq": [8, 9, 10], "list_part": 8, "all_part": 8, "datafran": 8, "compris": 8, "tmd_start": 8, "tmd_stop": 8, "string": [8, 14], "len": 8, "must": 8, "lenght": 8, "resp": 8, "extra": 8, "possibl": 8, "found": [8, 11, 18], "sf": 8, "gsec_sub_seq": 8, "n_split_min": 8, "n_split_max": 8, "steps_pattern": 8, "n_min": 8, "n_max": 8, "len_max": 8, "steps_periodicpattern": 8, "periodicpattern": 8, "greater": 8, "greatest": 8, "whole": [8, 10], "specfii": 8, "smallest": 8, "integ": 8, "6": 8, "vari": 8, "paramt": 8, "argumetn": 8, "get_featur": 8, "combin": [8, 18], "form": 8, "feat_matrix": 8, "n_job": 8, "return_label": 8, "pd": [8, 9, 18], "seri": 8, "job": 8, "parallel": 8, "spars": 8, "feat_nam": 8, "convert": 8, "depend": 8, "last": 8, "step1": 8, "step2": 8, "add_feat_valu": 8, "dict_scal": 8, "convent": 8, "letter": 8, "feature_valu": 8, "n_part": 8, "ha": [8, 18], "structur": [8, 21], "th": 8, "n_split": 8, "p1": 8, "p2": 8, "pn": 8, "end": 8, "odd": 8, "even": 8, "give": 8, 
"add_dif": 8, "sample_nam": 8, "ref_group": 8, "add_posit": 8, "part_split": 8, "feat_posit": 8, "total": [8, 9], "n_compon": 9, "pca_kwarg": 9, "offer": [9, 18], "approach": 9, "pca": 9, "dimension": [9, 21], "iter": 9, "reliabl": [9, 18], "These": [9, 18], "those": 9, "distant": 9, "altern": 9, "distanc": 9, "manhattan": 9, "cosin": 9, "80": 9, "cover": 9, "varianc": 9, "identif": [9, 21], "datapoint": 9, "inspir": [9, 18], "techniqu": 9, "theoret": 9, "high": [9, 21], "n_neg": 9, "label_po": 9, "name_neg": 9, "rel_neg": 9, "col_class": 9, "newli": 9, "updat": [9, 18], "new": [9, 18], "store": 9, "Will": 9, "dure": 9, "initi": 9, "datafor": 9, "conta": 9, "po": 9, "unl": 9, "numpi": [9, 18], "np": 9, "atgc": 9, "gcta": 9, "actg": 9, "tacg": 9, "mode": 9, "modifi": [9, 10, 16], "dpul": 9, "info": 10, "non_canonical_aa": 10, "remov": [10, 16], "min_len": 10, "max_len": 10, "distinguish": 10, "overview": 10, "tabl": 10, "cite": [10, 17, 19], "per": 10, "liter": 10, "keep": 10, "gap": [10, 14], "canon": 10, "do": 10, "replac": 10, "symbol": 10, "ref": 10, "just_aaindex": 11, "unclassified_in": 11, "through": 11, "scale_cat": 11, "relev": 11, "scale_classif": 11, "aaindex": [11, 21], "df": 11, "current": 12, "ut": 12, "plot_set": 12, "dict_scale_cat": [13, 14], "cppplot": [13, 14, 18], "respect": [13, 14, 17, 18, 19], "n_color": 14, "color_po": 14, "color_neg": 14, "color_cent": 14, "input": [14, 18], "hex": 14, "pct_gap": 14, "pct_center": 14, "palett": 14, "feat": 14, "ggplot": 14, "datagroup": 14, "dark": 14, "face": 14, "rgb": 14, "hl": 14, "husl": 14, "xkcd": 14, "latter": 14, "rang": 14, "sn": 14, "color_palett": 14, "light_palett": 14, "lighter": 14, "handl": 15, "list_cat": 15, "ncol": 15, "fontsiz": 15, "weight": [15, 21], "lw": 15, "edgecolor": 15, "return_handl": 15, "loc": 15, "upper": 15, "left": 15, "labelspac": 15, "columnspac": 15, "fontsize_legend": 15, "title_align_left": 15, "fontsize_weight": 15, "customiz": 15, "attach": 15, "item": 15, "coordin": 
15, "text": [15, 16], "vertic": 15, "horizont": 15, "marker": 15, "directli": [15, 18], "finer": 15, "control": 15, "how": 15, "line2d": 15, "cat1": 15, "red": 15, "cat2": 15, "o": 15, "fig_format": 16, "pdf": 16, "font_scal": 16, "arial": 16, "change_s": 16, "weight_bold": 16, "adjust_el": 16, "short_tick": 16, "no_tick": 16, "no_ticks_i": 16, "short_ticks_i": 16, "no_ticks_x": 16, "short_ticks_x": 16, "configur": 16, "visual": [16, 18], "variou": [16, 18], "file": [16, 18], "save": 16, "make": [16, 18], "visibl": 16, "choos": 16, "san": 16, "serif": 16, "verdana": 16, "helvetica": 16, "dejavu": 16, "element": 16, "bold": 16, "layout": 16, "short": 16, "mark": 16, "global": 16, "pypi": 17, "conda": [17, 18], "forg": 17, "pip": [17, 18], "introduct": 17, "contribut": 17, "tutori": 17, "api": 17, "explain": [17, 18, 21], "ai": [17, 18, 21], "perturb": 17, "util": [17, 18], "search": 17, "your": [17, 18, 19], "work": [17, 19], "pleas": [17, 18, 19], "_": [17, 19], "breimann": [17, 19, 21], "kamp": [17, 19], "steiner": [17, 19], "frishman": [17, 19], "2023": [17, 19], "ontologi": [17, 19, 21], "machin": [17, 18, 19, 21], "biorxiv": [17, 19, 21], "welcom": 18, "thank": 18, "we": 18, "open": 18, "project": [18, 22], "focus": 18, "involv": 18, "invalu": 18, "made": 18, "wai": 18, "suggest": 18, "github": 18, "issu": 18, "tracker": 18, "submit": 18, "improv": [18, 21], "particip": 18, "discuss": 18, "newcom": 18, "tackl": 18, "good": 18, "email": 18, "stephanbreimann": 18, "gmail": 18, "com": 18, "question": 18, "establish": 18, "comprehens": 18, "toolkit": 18, "robust": 18, "life": 18, "scienc": 18, "integr": [18, 21], "seamlessli": 18, "flexibl": 18, "interoper": 18, "packag": 18, "biopython": 18, "reimplement": 18, "solut": 18, "ignor": 18, "biolog": [18, 20], "context": 18, "relianc": 18, "opaqu": 18, "box": 18, "empir": 18, "insight": 18, "cut": 18, "fair": 18, "account": 18, "transpar": 18, "re": [18, 21], "commit": 18, "divers": 18, "aspect": 18, "causal": 18, 
"minim": 18, "reproduc": 18, "mre": 18, "least": 18, "amount": 18, "demonstr": 18, "self": 18, "ensur": 18, "necessari": 18, "confirm": 18, "replic": 18, "guidelin": 18, "here": [18, 22], "To": 18, "git": 18, "breimanntool": 18, "master": 18, "repositori": 18, "your_usernam": 18, "navig": 18, "folder": 18, "up": 18, "cd": 18, "isol": 18, "aanalysi": 18, "9": 18, "activ": 18, "poetri": 18, "pytest": 18, "hypothesi": 18, "execut": 18, "case": 18, "directori": 18, "substanti": 18, "minor": 18, "typo": 18, "concis": 18, "clear": 18, "branch": 18, "fix": 18, "readm": 18, "date": 18, "readthedoc": 18, "crucial": 18, "modif": 18, "thei": 18, "render": 18, "correctli": 18, "strive": 18, "consist": [18, 20], "interfac": 18, "well": 18, "organ": 18, "codebas": 18, "standalon": 18, "focu": 18, "special": 18, "task": 18, "carri": 18, "out": 18, "complet": 18, "process": 18, "fulfil": 18, "purpos": 18, "being": 18, "implement": 18, "inherit": 18, "supplementari": 18, "accordingli": 18, "suffix": 18, "support": 18, "semi": 18, "strictli": 18, "adher": 18, "aforement": 18, "primari": 18, "_util": 18, "_utils_const": 18, "py": 18, "modular": 18, "easili": 18, "therefor": 18, "flat": 18, "hierarchi": 18, "program": 18, "outlin": 18, "softwar": 18, "user": 18, "friendli": 18, "hint": 18, "enhanc": 18, "propos": 18, "pep": 18, "484": 18, "book": 18, "error": 18, "messag": 18, "docstr": 18, "257": 18, "guid": 18, "markup": 18, "languag": 18, "restructuredtext": 18, "rst": 18, "primer": 18, "autodoc": 18, "sphinx": 18, "inclus": 18, "napoleon": 18, "extens": 18, "conf": 18, "go": 18, "_build": 18, "browser": 18, "citat": 19, "wa": 20, "develop": 20, "typic": 20, "et": 21, "al": 21, "2023a": 21, "2023b": 21, "2023c": 21, "chart": 21, "\u03b3": 21, "cheng": 21, "2006": 21, "larg": 21, "disulphid": 21, "kernel": 21, "recurs": 21, "neural": 21, "network": 21, "graph": 21, "match": 21, "struct": 21, "funct": 21, "kawashima": 21, "2008": 21, "aid": 21, "databas": 21, "report": 21, "nucleic": 
21, "magnan": 21, "randal": 21, "baldi": 21, "2009": 21, "accur": 21, "bioinformat": 21, "galiez": 21, "2016": 21, "viral": 21, "song": 21, "2018": 21, "throughput": 21, "90": 21, "proteas": 21, "accuraci": 21, "shen": 21, "2019": 21, "local": 21, "evolutionari": 21, "chou": 21, "pseaac": 21, "j": 21, "theor": 21, "biol": 21, "tang": 21, "2020": 21, "teng": 21, "2021": 21, "pseudo": 21, "composit": 21, "tripeptid": 21, "bmc": 21, "yang": 21, "granular": 21, "multipl": 21, "appl": 21, "chronolog": 22, "histori": 22, "overview_benchmark": 22, "overview_scal": 22, "view": 24}, "objects": {"aaanalysis": [[5, 0, 1, "", "AAclust"], [6, 0, 1, "", "CPP"], [7, 0, 1, "", "CPPPlot"], [8, 0, 1, "", "SequenceFeature"], [9, 0, 1, "", "dPULearn"], [10, 3, 1, "", "load_dataset"], [11, 3, 1, "", "load_scales"], [12, 3, 1, "", "plot_gcfs"], [13, 3, 1, "", "plot_get_cdict"], [14, 3, 1, "", "plot_get_cmap"], [15, 3, 1, "", "plot_set_legend"], [16, 3, 1, "", "plot_settings"]], "aaanalysis.AAclust": [[5, 1, 1, "", "__init__"], [5, 2, 1, "", "center_labels_"], [5, 2, 1, "", "centers_"], [5, 1, 1, "", "cluster_naming"], [5, 1, 1, "", "correlation"], [5, 1, 1, "", "eval"], [5, 1, 1, "", "fit"], [5, 1, 1, "", "get_cluster_centers"], [5, 1, 1, "", "get_cluster_medoids"], [5, 2, 1, "", "labels_"], [5, 2, 1, "", "medoid_ind_"], [5, 2, 1, "", "medoid_labels_"], [5, 2, 1, "", "medoids_"], [5, 2, 1, "", "n_clusters"]], "aaanalysis.CPP": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "eval"], [6, 1, 1, "", "run"]], "aaanalysis.CPPPlot": [[7, 1, 1, "", "__init__"], [7, 1, 1, "", "heatmap"], [7, 1, 1, "", "profile"], [7, 1, 1, "", "update_seq_size"]], "aaanalysis.SequenceFeature": [[8, 1, 1, "", "__init__"], [8, 1, 1, "", "add_dif"], [8, 1, 1, "", "add_feat_value"], [8, 1, 1, "", "add_position"], [8, 1, 1, "", "feat_matrix"], [8, 1, 1, "", "feat_names"], [8, 1, 1, "", "get_df_parts"], [8, 1, 1, "", "get_features"], [8, 1, 1, "", "get_split_kws"]], "aaanalysis.dPULearn": [[9, 1, 1, "", "__init__"], [9, 1, 
1, "", "eval"], [9, 1, 1, "", "fit"], [9, 2, 1, "", "labels_"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:attribute", "3": "py:function"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "function", "Python function"]}, "titleterms": {"tabl": [2, 17, 22], "overview": [2, 17, 22], "protein": [2, 22, 24], "benchmark": [2, 21, 22], "dataset": [2, 21, 22], "amino": 2, "acid": 2, "scale": 2, "data": [3, 4, 24], "load": [3, 24], "api": 4, "featur": [4, 24], "engin": [4, 24], "pu": 4, "learn": 4, "explain": 4, "ai": 4, "perturb": 4, "plot": 4, "util": 4, "aaanalysi": [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], "aaclust": 5, "note": [5, 6, 8, 9, 10, 16], "cpp": 6, "cppplot": 7, "exampl": [7, 8, 9, 15, 16, 17], "sequencefeatur": 8, "dpulearn": 9, "load_dataset": 10, "load_scal": 11, "plot_gcf": 12, "plot_get_cdict": 13, "plot_get_cmap": 14, "plot_set_legend": 15, "plot_set": 16, "welcom": 17, "document": [17, 18], "instal": [17, 18], "refer": [17, 21], "indic": 17, "citat": 17, "contribut": 18, "introduct": [18, 20], "vision": 18, "object": 18, "non": 18, "goal": 18, "principl": [18, 23], "bug": 18, "report": 18, "latest": 18, "version": 18, "local": 18, "develop": 18, "environ": 18, "fork": 18, "clone": 18, "depend": 18, "run": 18, "unit": 18, "test": 18, "pull": 18, "request": 18, "preview": 18, "chang": 18, "name": 18, "convent": 18, "class": 18, "templat": 18, "function": 18, "method": 18, "code": 18, "philosophi": 18, "style": 18, "build": 18, "doc": 18, "workflow": 20, "algorithm": 21, "us": 21, "case": 21, "further": 21, "inform": 21, "usag": 23, "tutori": 24, "redund": 24, "reduct": 24, "identif": 24, "neg": 24, "predict": 24}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, 
"sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx": 57}, "alltitles": {"Tables": [[2, "tables"], [22, "tables"]], "Overview Table": [[2, "overview-table"], [22, "overview-table"]], "Protein benchmark datasets": [[2, "protein-benchmark-datasets"], [22, "protein-benchmark-datasets"]], "Amino acid scale datasets": [[2, "amino-acid-scale-datasets"]], "Data Loading": [[3, "data-loading"]], "API": [[4, "api"]], "Data": [[4, "data"]], "Feature Engineering": [[4, "feature-engineering"]], "PU Learning": [[4, "pu-learning"]], "Explainable AI": [[4, "explainable-ai"]], "Perturbation": [[4, "perturbation"]], "Plot Utilities": [[4, "plot-utilities"]], "aaanalysis.AAclust": [[5, "aaanalysis-aaclust"]], "Notes": [[5, null], [6, null], [6, null], [8, null], [8, null], [8, null], [8, null], [8, null], [9, null], [9, null], [10, null], [16, null]], "aaanalysis.CPP": [[6, "aaanalysis-cpp"]], "aaanalysis.CPPPlot": [[7, "aaanalysis-cppplot"]], "Examples": [[7, null], [8, null], [8, null], [9, null], [15, null], [16, null]], "aaanalysis.SequenceFeature": [[8, "aaanalysis-sequencefeature"]], "aaanalysis.dPULearn": [[9, "aaanalysis-dpulearn"]], "aaanalysis.load_dataset": [[10, "aaanalysis-load-dataset"]], "aaanalysis.load_scales": [[11, "aaanalysis-load-scales"]], "aaanalysis.plot_gcfs": [[12, "aaanalysis-plot-gcfs"]], "aaanalysis.plot_get_cdict": [[13, "aaanalysis-plot-get-cdict"]], "aaanalysis.plot_get_cmap": [[14, "aaanalysis-plot-get-cmap"]], "aaanalysis.plot_set_legend": [[15, "aaanalysis-plot-set-legend"]], "aaanalysis.plot_settings": [[16, "aaanalysis-plot-settings"]], "Welcome to the AAanalysis documentation": [[17, "welcome-to-the-aaanalysis-documentation"]], "Install": [[17, "install"]], "OVERVIEW": [[17, null]], "EXAMPLES": [[17, null]], "REFERENCES": [[17, null]], "Indices and tables": [[17, "indices-and-tables"]], "Citation": [[17, "citation"]], "Contributing": [[18, "contributing"]], 
"Introduction": [[18, "introduction"], [20, "introduction"]], "Vision": [[18, "vision"]], "Objectives": [[18, "objectives"]], "Non-goals": [[18, "non-goals"]], "Principles": [[18, "principles"]], "Bug Reports": [[18, "bug-reports"]], "Installation": [[18, "installation"]], "Latest Version": [[18, "latest-version"]], "Local Development Environment": [[18, "local-development-environment"]], "Fork and Clone": [[18, "fork-and-clone"]], "Install Dependencies": [[18, "install-dependencies"]], "Run Unit Tests": [[18, "run-unit-tests"]], "Pull Requests": [[18, "pull-requests"]], "Preview Changes": [[18, "preview-changes"]], "Documentation": [[18, "documentation"]], "Naming Conventions": [[18, "naming-conventions"]], "Class Templates": [[18, "class-templates"]], "Function and Method Naming": [[18, "function-and-method-naming"]], "Code Philosophy": [[18, "code-philosophy"]], "Documentation Style": [[18, "documentation-style"]], "Building the Docs": [[18, "building-the-docs"]], "Workflow": [[20, "workflow"]], "References": [[21, "references"]], "Algorithms": [[21, "algorithms"]], "Datasets and Benchmarks": [[21, "datasets-and-benchmarks"]], "Use Cases": [[21, "use-cases"]], "Further Information": [[21, "further-information"]], "Usage Principles": [[23, "usage-principles"]], "Tutorials": [[24, "tutorials"]], "Data loading": [[24, "data-loading"]], "Redundancy-reduction": [[24, "redundancy-reduction"]], "Feature engineering": [[24, "feature-engineering"]], "Identification of negatives": [[24, "identification-of-negatives"]], "Protein prediction": [[24, "protein-prediction"]]}, "indexentries": {"aaclust (class in aaanalysis)": [[5, "aaanalysis.AAclust"]], "__init__() (aaanalysis.aaclust method)": [[5, "aaanalysis.AAclust.__init__"]], "center_labels_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.center_labels_"]], "centers_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.centers_"]], "cluster_naming() (aaanalysis.aaclust method)": [[5, 
"aaanalysis.AAclust.cluster_naming"]], "correlation() (aaanalysis.aaclust static method)": [[5, "aaanalysis.AAclust.correlation"]], "eval() (aaanalysis.aaclust method)": [[5, "aaanalysis.AAclust.eval"]], "fit() (aaanalysis.aaclust method)": [[5, "aaanalysis.AAclust.fit"]], "get_cluster_centers() (aaanalysis.aaclust static method)": [[5, "aaanalysis.AAclust.get_cluster_centers"]], "get_cluster_medoids() (aaanalysis.aaclust static method)": [[5, "aaanalysis.AAclust.get_cluster_medoids"]], "labels_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.labels_"]], "medoid_ind_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.medoid_ind_"]], "medoid_labels_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.medoid_labels_"]], "medoids_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.medoids_"]], "n_clusters (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.n_clusters"]], "cpp (class in aaanalysis)": [[6, "aaanalysis.CPP"]], "__init__() (aaanalysis.cpp method)": [[6, "aaanalysis.CPP.__init__"]], "eval() (aaanalysis.cpp static method)": [[6, "aaanalysis.CPP.eval"]], "run() (aaanalysis.cpp method)": [[6, "aaanalysis.CPP.run"]], "cppplot (class in aaanalysis)": [[7, "aaanalysis.CPPPlot"]], "__init__() (aaanalysis.cppplot method)": [[7, "aaanalysis.CPPPlot.__init__"]], "heatmap() (aaanalysis.cppplot method)": [[7, "aaanalysis.CPPPlot.heatmap"]], "profile() (aaanalysis.cppplot method)": [[7, "aaanalysis.CPPPlot.profile"]], "update_seq_size() (aaanalysis.cppplot method)": [[7, "aaanalysis.CPPPlot.update_seq_size"]], "sequencefeature (class in aaanalysis)": [[8, "aaanalysis.SequenceFeature"]], "__init__() (aaanalysis.sequencefeature method)": [[8, "aaanalysis.SequenceFeature.__init__"]], "add_dif() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.add_dif"]], "add_feat_value() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.add_feat_value"]], "add_position() 
(aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.add_position"]], "feat_matrix() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.feat_matrix"]], "feat_names() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.feat_names"]], "get_df_parts() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.get_df_parts"]], "get_features() (aaanalysis.sequencefeature method)": [[8, "aaanalysis.SequenceFeature.get_features"]], "get_split_kws() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.get_split_kws"]], "__init__() (aaanalysis.dpulearn method)": [[9, "aaanalysis.dPULearn.__init__"]], "dpulearn (class in aaanalysis)": [[9, "aaanalysis.dPULearn"]], "eval() (aaanalysis.dpulearn method)": [[9, "aaanalysis.dPULearn.eval"]], "fit() (aaanalysis.dpulearn method)": [[9, "aaanalysis.dPULearn.fit"]], "labels_ (aaanalysis.dpulearn attribute)": [[9, "aaanalysis.dPULearn.labels_"]], "load_dataset() (in module aaanalysis)": [[10, "aaanalysis.load_dataset"]], "load_scales() (in module aaanalysis)": [[11, "aaanalysis.load_scales"]], "plot_gcfs() (in module aaanalysis)": [[12, "aaanalysis.plot_gcfs"]], "plot_get_cdict() (in module aaanalysis)": [[13, "aaanalysis.plot_get_cdict"]], "plot_get_cmap() (in module aaanalysis)": [[14, "aaanalysis.plot_get_cmap"]], "plot_set_legend() (in module aaanalysis)": [[15, "aaanalysis.plot_set_legend"]], "plot_settings() (in module aaanalysis)": [[16, "aaanalysis.plot_settings"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["_index/badges", "_index/overview", "_index/tables", "_index/usage_principles/data_loading", "api", "generated/aaanalysis.AAclust", "generated/aaanalysis.CPP", "generated/aaanalysis.CPPPlot", "generated/aaanalysis.SequenceFeature", "generated/aaanalysis.dPULearn", "generated/aaanalysis.load_dataset", "generated/aaanalysis.load_scales", "generated/aaanalysis.plot_gcfs", 
"generated/aaanalysis.plot_get_cdict", "generated/aaanalysis.plot_get_cmap", "generated/aaanalysis.plot_set_legend", "generated/aaanalysis.plot_settings", "index", "index/CONTRIBUTING_COPY", "index/citations", "index/introduction", "index/references", "index/tables_template", "index/usage_principles", "tutorials"], "filenames": ["_index/badges.rst", "_index/overview.rst", "_index/tables.rst", "_index/usage_principles/data_loading.rst", "api.rst", "generated/aaanalysis.AAclust.rst", "generated/aaanalysis.CPP.rst", "generated/aaanalysis.CPPPlot.rst", "generated/aaanalysis.SequenceFeature.rst", "generated/aaanalysis.dPULearn.rst", "generated/aaanalysis.load_dataset.rst", "generated/aaanalysis.load_scales.rst", "generated/aaanalysis.plot_gcfs.rst", "generated/aaanalysis.plot_get_cdict.rst", "generated/aaanalysis.plot_get_cmap.rst", "generated/aaanalysis.plot_set_legend.rst", "generated/aaanalysis.plot_settings.rst", "index.rst", "index/CONTRIBUTING_COPY.rst", "index/citations.rst", "index/introduction.rst", "index/references.rst", "index/tables_template.rst", "index/usage_principles.rst", "tutorials.rst"], "titles": ["<no title>", "<no title>", "Tables", "Data Loading", "API", "aaanalysis.AAclust", "aaanalysis.CPP", "aaanalysis.CPPPlot", "aaanalysis.SequenceFeature", "aaanalysis.dPULearn", "aaanalysis.load_dataset", "aaanalysis.load_scales", "aaanalysis.plot_gcfs", "aaanalysis.plot_get_cdict", "aaanalysis.plot_get_cmap", "aaanalysis.plot_set_legend", "aaanalysis.plot_settings", "Welcome to the AAanalysis documentation", "Contributing", "<no title>", "Introduction", "References", "Tables", "Usage Principles", "Tutorials"], "terms": {"aaanalysi": [1, 4, 18, 19, 20, 22, 23], "amino": [1, 3, 5, 6, 7, 8, 10, 11, 17, 19, 20, 21, 22], "acid": [1, 3, 5, 6, 7, 8, 10, 11, 17, 19, 20, 21, 22], "analysi": [1, 9, 10, 11, 17, 18, 20], "i": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 17, 18, 20], "python": [1, 17, 18, 20], "framework": [1, 5, 17, 20], "interpret": [1, 14, 17, 18, 19, 
20, 21], "sequenc": [1, 2, 3, 6, 7, 8, 9, 10, 11, 17, 18, 20, 21], "base": [1, 5, 6, 7, 8, 9, 14, 17, 18, 20, 21], "protein": [1, 7, 8, 10, 17, 18, 20, 21], "predict": [1, 2, 3, 17, 18, 20, 21], "provid": [1, 3, 5, 6, 7, 9, 11, 14, 17, 18], "follow": [1, 3, 4, 5, 6, 8, 9, 17, 18, 19, 20], "algorithm": [1, 6, 7, 17, 18, 20], "aaclust": [1, 17, 19, 20, 21, 24], "k": [1, 5, 17, 20, 21], "optim": [1, 5, 6, 7, 17, 20, 21], "cluster": [1, 5, 17, 20, 21], "wrapper": [1, 7, 17, 18, 20], "select": [1, 5, 6, 7, 10, 11, 17, 20, 21], "redund": [1, 5, 6, 17, 20, 21], "reduc": [1, 5, 9, 17, 20, 21], "set": [1, 2, 5, 6, 7, 8, 9, 10, 12, 15, 16, 17, 18, 20, 21], "numer": [1, 5, 7, 8, 17, 20], "scale": [1, 5, 6, 7, 8, 11, 13, 14, 16, 17, 19, 20, 21, 22], "e": [1, 3, 7, 8, 13, 14, 16, 17, 18, 20], "g": [1, 3, 7, 8, 13, 14, 16, 17, 18, 20], "cpp": [1, 7, 8, 14, 17, 19, 20, 24], "compar": [1, 17, 20], "physicochem": [1, 6, 8, 17, 20, 21], "profil": [1, 7, 13, 14, 17, 20], "featur": [1, 5, 6, 7, 8, 9, 14, 17, 18, 20], "engin": [1, 17, 18, 20], "two": [1, 6, 7, 17, 18, 20, 21], "identifi": [1, 6, 7, 9, 17, 20, 21], "most": [1, 5, 6, 7, 9, 17, 20], "distinct": [1, 8, 17, 20], "dpulearn": [1, 17, 19, 20, 24], "determinist": [1, 9, 17, 20], "posit": [1, 2, 3, 5, 6, 7, 8, 9, 14, 17, 20], "unlabel": [1, 3, 9, 17, 20], "pu": [1, 2, 3, 9, 17, 20], "learn": [1, 3, 5, 9, 17, 18, 19, 20, 21], "enabl": [1, 5, 6, 7, 8, 9, 16, 17, 18, 20], "train": [1, 17, 18, 20], "unbalanc": [1, 17, 18, 20], "small": [1, 9, 17, 18, 20], "dataset": [1, 3, 5, 6, 10, 11, 17, 18, 20], "moreov": [1, 17], "function": [1, 4, 7, 12, 14, 16, 17], "load": [1, 10, 11, 17, 18], "benchmark": [1, 3, 10, 17], "load_data": [1, 17], "load_scal": [1, 2, 8, 17, 22], "depth": [1, 17], "level": [1, 2, 7, 10, 17], "classif": [1, 2, 3, 7, 11, 17], "aaontologi": [1, 2, 11, 17, 19, 21], "descript": [2, 18, 22], "see": [2, 3, 7, 10, 18, 22], "also": [2, 3, 9, 18, 22], "1_overview_benchmark": 2, "aa": [2, 3, 4, 6, 8, 9, 10, 15, 16, 22, 23], 
"load_dataset": [2, 4, 8, 22], "2_overview_scal": 2, "neg": [2, 5, 8, 9, 14], "predictor": 2, "refer": [2, 4, 5, 6, 8], "label": [2, 5, 6, 7, 8, 9, 15, 18], "aa_caspase3": 2, "233": 2, "185605": 2, "705": 2, "184900": 2, "prosper": [2, 21], "caspas": 2, "3": [2, 5, 8, 9, 15, 18], "cleavag": [2, 21], "site": [2, 21], "song18": [2, 21], "1": [2, 3, 5, 6, 7, 8, 9, 15, 16], "adjac": 2, "0": [2, 5, 6, 7, 8, 9, 15, 16], "aa_furin": 2, "71": 2, "59003": 2, "163": 2, "58840": 2, "furin": 2, "aa_ldr": [2, 3], "342": 2, "118248": 2, "35469": 2, "82779": 2, "idp": [2, 21], "seq2seq": [2, 21], "long": 2, "intrins": [2, 21], "disord": [2, 21], "region": [2, 21], "ldr": 2, "tang20": [2, 21], "order": [2, 5, 22], "aa_mmp2": 2, "573": 2, "312976": 2, "2416": 2, "310560": 2, "matrix": [2, 5, 8, 9], "metallopeptidas": 2, "2": [2, 3, 5, 6, 7, 8, 9, 15], "mmp2": 2, "aa_rnabind": 2, "221": 2, "55001": 2, "6492": 2, "48509": 2, "gmksvm": 2, "ru": 2, "rna": [2, 21], "bind": [2, 21], "residu": [2, 3, 11, 21], "rbp60": 2, "yang21": [2, 21], "non": [2, 6, 8, 10], "aa_sa": 2, "101082": 2, "84523": 2, "solvent": 2, "access": [2, 4], "sa": 2, "data": [2, 5, 7, 9, 17, 18], "expos": 2, "buri": 2, "seq_amylo": [2, 3], "1414": 2, "8484": 2, "511": 2, "903": 2, "rerf": [2, 21], "pred": [2, 21], "amyloidognen": 2, "teng21": [2, 21], "amyloidogen": [2, 21], "seq_capsid": 2, "7935": 2, "3364680": 2, "3864": 2, "4071": 2, "viralpro": [2, 21], "capdsid": 2, "galiez16": [2, 21], "capsid": [2, 21], "seq_disulfid": 2, "2547": 2, "614470": 2, "897": 2, "1650": 2, "dipro": 2, "disulfid": 2, "bridg": [2, 21], "cheng06": [2, 21], "ss": 2, "bond": 2, "without": [2, 5, 7, 18], "seq_loc": 2, "1835": 2, "732398": 2, "1045": 2, "790": 2, "nan": 2, "subcellular": [2, 21], "locat": [2, 15], "cytoplasm": 2, "v": 2, "plasma": 2, "membran": [2, 8], "shen19": [2, 21], "seq_solubl": 2, "17408": 2, "4432269": 2, "8704": 2, "solpro": [2, 21], "solubl": [2, 21], "insolubl": 2, "magnan09": [2, 21], "seq_tail": 2, "6668": 2, 
"2671690": 2, "2574": 2, "4094": 2, "tail": [2, 21], "domain": [2, 3, 8, 10], "dom_gsec": [2, 3], "126": 2, "92964": 2, "63": 2, "gamma": 2, "secretas": [2, 21], "substrat": [2, 21], "breimann23c": [2, 21], "dom_gsec_pu": [2, 3], "694": 2, "494524": 2, "unknown": 2, "statu": 2, "min": 2, "max": 2, "normal": [2, 7, 15], "586": 2, "breimann23b": [2, 17, 19, 21], "scales_raw": [2, 11], "raw": 2, "valu": [2, 5, 6, 7, 8, 18, 20], "kawashima08": [2, 21], "scales_classif": 2, "scales_pc": [2, 11], "princip": [2, 9], "compon": [2, 8, 9], "pc": [2, 9], "compress": 2, "20": [2, 6, 7, 8, 18], "breimann23a": [2, 10, 11, 21], "top60": [2, 11], "top": [2, 5], "60": 2, "subset": [2, 8], "top60_ev": [2, 11], "evalu": [2, 6, 11, 18], "three": [3, 5, 8, 14], "type": [3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 18], "ar": [3, 6, 7, 8, 9, 10, 11, 18, 22], "us": [3, 5, 6, 7, 9, 10, 14, 16, 17, 18, 19, 20], "specif": [3, 5, 13], "properti": [3, 8], "dom": [3, 10], "seq": [3, 10], "The": [3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 18], "each": [3, 5, 6, 7, 8, 9], "indic": [3, 5, 7, 8, 9], "first": [3, 6, 7, 8, 14, 18], "part": [3, 6, 7, 8, 18], "name": [3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16], "an": [3, 9, 10, 17, 18, 19, 21], "abbrevi": 3, "For": [3, 4, 5, 8, 15, 18], "some": 3, "addit": [3, 7, 8, 9, 15, 16], "version": 3, "contain": [3, 4, 6, 7, 9, 10, 18], "onli": [3, 7, 11, 18], "sampl": [3, 5, 6, 7, 8, 9], "dataset_nam": 3, "_pu": 3, "thi": [4, 5, 7, 16, 18], "page": [4, 17], "public": [4, 17, 18, 19], "object": [4, 5, 7, 8, 9], "more": [4, 7, 18], "exampl": [4, 18], "practic": 4, "usag": [4, 17], "our": [4, 18], "notebook": [4, 24], "conveni": 4, "common": [4, 18], "import": [4, 8, 9, 15, 16, 18, 23], "modul": [4, 5, 17], "Then": 4, "you": [4, 17, 18, 19], "can": [4, 5, 8, 9, 11, 15, 17, 18, 20], "all": [4, 5, 6, 7, 8, 16, 18, 22], "method": [4, 5, 6, 7, 8, 9, 21], "via": [4, 18, 21], "alia": [4, 8], "class": [5, 6, 7, 8, 9, 10], "model": [5, 9, 18], "none": [5, 6, 7, 8, 9, 10, 13, 14, 15], 
"model_kwarg": 5, "verbos": [5, 6, 7, 8, 9, 16], "fals": [5, 6, 7, 8, 9, 11, 14, 15, 16], "sourc": [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18], "A": [5, 8, 11, 15, 18, 20, 21], "design": [5, 7, 18], "primarili": [5, 9, 18], "versatil": 5, "enough": 5, "ani": [5, 18, 20], "It": [5, 20], "take": 5, "requir": 5, "pre": [5, 6, 18], "defin": [5, 8, 18], "number": [5, 6, 7, 8, 9, 10, 14, 15], "from": [5, 6, 7, 8, 9, 11, 17, 18, 22], "scikit": [5, 18], "http": [5, 18], "org": [5, 18], "stabl": 5, "html": [5, 18], "By": 5, "leverag": 5, "pearson": [5, 6], "correl": [5, 6], "similar": 5, "measur": [5, 18], "one": [5, 7], "repres": [5, 7, 20], "term": 5, "medoid": 5, "which": [5, 7, 8, 12, 20], "closest": 5, "": [5, 15, 21], "center": [5, 14], "yield": 5, "paramet": [5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16], "callabl": 5, "option": [5, 6, 7, 8, 9, 10, 11, 14, 16], "default": [5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16], "sklearn": 5, "kmean": 5, "emploi": [5, 9], "given": [5, 7, 8, 11, 22], "n_cluster": 5, "dict": [5, 6, 7, 8, 9, 13, 14, 15], "dictionari": [5, 6, 7, 8, 13, 14, 15], "keyword": [5, 7, 9], "argument": [5, 7, 8, 9, 15], "pass": [5, 7, 9, 15, 18], "bool": [5, 6, 7, 8, 9, 11, 14, 15, 16], "flag": 5, "disabl": [5, 10], "output": [5, 8, 9, 16], "obtain": [5, 8], "int": [5, 6, 7, 8, 9, 10, 14, 15], "labels_": [5, 9], "arrai": [5, 6, 8, 9], "like": [5, 6, 8, 9, 18], "centers_": 5, "averag": [5, 8], "correspond": [5, 18], "center_labels_": 5, "medoids_": 5, "medoid_labels_": 5, "medoid_ind_": 5, "chosen": [5, 6, 8], "within": [5, 6, 8], "origin": 5, "__init__": [5, 6, 7, 8, 9], "fit": [5, 9, 18], "x": [5, 7, 9, 10, 15, 16], "on_cent": 5, "true": [5, 6, 7, 8, 11, 15, 16], "min_th": 5, "merge_metr": 5, "euclidean": [5, 9], "format": [5, 16], "determin": 5, "partit": 5, "maxim": 5, "beyond": 5, "threshold": [5, 6], "qualiti": 5, "either": [5, 8, 17], "minimum": [5, 8, 10], "member": 5, "min_cor": 5, "between": [5, 6, 7, 8, 14, 15], "its": [5, 18], "govern": 5, "undergo": 5, 
"stage": 5, "estim": 5, "lower": 5, "bound": 5, "refin": 5, "metric": [5, 9, 18], "merg": 5, "smaller": 5, "direct": 5, "final": 5, "reduct": 5, "shape": [5, 6, 7, 8, 9, 15], "n_sampl": [5, 6, 8, 9], "n_featur": [5, 6, 7, 8, 9], "where": [5, 8, 9], "list": [5, 7, 8, 14, 15], "str": [5, 7, 8, 9, 10, 11, 13, 14, 15, 16], "If": [5, 6, 7, 8, 9, 10, 14, 16, 17, 18, 19], "return": [5, 6, 7, 8, 9, 10, 11, 13, 14, 15], "appli": [5, 9, 14, 15, 16], "otherwis": [5, 7, 8, 9], "float": [5, 6, 7, 9, 14, 15, 16], "instead": 5, "names_medoid": 5, "attribut": 5, "attr": 5, "further": [5, 7, 10, 18], "inform": [5, 6, 7, 8, 9, 10], "paper": 5, "todo": [5, 6, 11], "add": [5, 6, 7, 8], "link": [5, 6, 17, 18, 19, 21], "cluster_nam": 5, "name_unclassifi": 5, "unclassifi": [5, 11], "assign": [5, 7, 8, 9], "frequenc": 5, "renam": 5, "prioriti": 5, "frequent": 5, "alreadi": 5, "doe": 5, "exist": [5, 18], "cannot": 5, "classifi": [5, 7], "static": [5, 6, 8], "get_cluster_cent": 5, "comput": [5, 6, 7, 8, 18, 21], "center_label": 5, "associ": 5, "get_cluster_medoid": 5, "medoid_label": 5, "medoid_ind": 5, "index": [5, 17, 18, 21], "x_test": 5, "x_ref": 5, "labels_test": 5, "labels_ref": 5, "n": [5, 6, 7, 8, 10, 18, 21], "except_unclassifi": 5, "test": [5, 6], "consid": [5, 18], "strength": 5, "els": 5, "exclud": 5, "list_top_center_name_corr": 5, "have": [5, 18], "strongest": 5, "eval": [5, 6, 9, 18], "df_scale": [6, 8], "df_cat": [6, 7, 8], "df_part": [6, 8], "split_kw": [6, 8], "accept_gap": [6, 7, 8], "tool": [6, 18, 21], "creat": [6, 7, 8, 9, 18], "filter": [6, 7, 10], "discrimin": [6, 7], "panda": [6, 7, 8, 9, 11, 18], "datafram": [6, 7, 8, 9, 10, 11, 18], "load_categori": [6, 8], "categori": [6, 7, 8, 11, 13, 14, 15], "sequencefeatur": 6, "get_split_kw": [6, 8], "nest": [6, 8], "split_typ": [6, 8], "whether": [6, 7, 8, 11, 14, 15], "accept": [6, 7, 8], "miss": [6, 7, 8], "omit": [6, 7, 8], "print": [6, 7, 8], "progress": [6, 7, 21], "about": [6, 7], "run": [6, 8], "perform": [6, 9], 
"step": [6, 7, 8, 20], "parametr": 6, "n_filter": 6, "100": [6, 14], "tmd_len": [6, 7, 8], "jmd_n_len": [6, 7, 8], "10": [6, 7, 8, 14], "jmd_c_len": [6, 7, 8], "ext_len": [6, 7, 8], "4": [6, 7, 8], "start": [6, 7, 8, 18], "check_cat": 6, "n_pre_filt": 6, "pct_pre_filt": 6, "5": [6, 7, 8, 9, 15], "max_std_test": 6, "max_overlap": 6, "max_cor": 6, "n_process": 6, "pipelin": [6, 18], "creation": 6, "aim": [6, 7, 18], "collect": 6, "group": [6, 7, 8], "t": 6, "u": [6, 17, 18], "p": [6, 21], "percentag": [6, 9, 14], "length": [6, 7, 8, 10], "tmd": [6, 7, 8], "explan": [6, 7], "terminu": [6, 7, 8], "jmd": [6, 7, 8], "c": [6, 7, 8, 17, 21], "extend": [6, 7, 8, 18], "termin": [6, 7, 8], "should": [6, 7, 8, 9, 11, 18], "longer": 6, "than": 6, "check": [6, 18], "remain": [6, 18], "after": 6, "maximum": [6, 8, 9, 10], "standard": 6, "deviat": 6, "overlap": 6, "cpu": 6, "multiprocess": 6, "automat": [6, 7, 9, 18], "df_feat": [6, 7, 8], "uniqu": [6, 7], "statist": [6, 7], "n_feature_inform": [6, 7], "eleven": 6, "column": [6, 7, 8, 9, 10, 15, 18], "includ": [6, 8, 11, 14, 15, 18], "id": [6, 8], "result": 6, "rank": 6, "11": [6, 7, 15], "split": [6, 8], "subcategori": [6, 7], "sub": 6, "scale_nam": [6, 7], "abs_auc": [6, 7], "absolut": 6, "adjust": [6, 7, 16], "auc": 6, "abs_mean_dif": 6, "mean": [6, 7], "differ": [6, 7, 8, 15], "std_test": [6, 7], "std_ref": 6, "p_val": 6, "mann_whitnei": 6, "ttest_indep": 6, "p_val_fdr_bh": 6, "benjamini": 6, "hochberg": 6, "fdr": 6, "correct": 6, "get": [6, 8, 12], "condit": [7, 8], "jmd_m_len": [7, 8], "y": [7, 15, 16], "val_col": 7, "mean_dif": 7, "val_typ": 7, "count": [7, 11], "figsiz": 7, "7": [7, 8, 9, 16], "titl": [7, 15], "title_kw": 7, "dict_color": [7, 13, 14, 15], "edge_color": 7, "bar_width": 7, "75": 7, "add_jmd_tmd": 7, "jmd_n_seq": 7, "tmd_seq": 7, "jmd_c_seq": 7, "tmd_color": 7, "mediumspringgreen": 7, "jmd_color": 7, "blue": [7, 15], "tmd_seq_color": 7, "black": [7, 18], "jmd_seq_color": 7, "white": 7, "seq_siz": 7, 
"tmd_jmd_fontsiz": 7, "xtick_siz": 7, "xtick_width": 7, "xtick_length": 7, "xticks_po": 7, "ytick_siz": 7, "ytick_width": 7, "ytick_length": 7, "ylim": 7, "highlight_tmd_area": 7, "highlight_alpha": 7, "15": [7, 8], "grid": [7, 16], "grid_axi": [7, 16], "both": [7, 16], "add_legend_cat": 7, "legend_kw": 7, "shap_plot": 7, "kwarg": [7, 8, 15], "plot": [7, 13, 14, 15, 16, 17, 18], "instanc": 7, "avail": [7, 17, 19, 21], "specifi": [7, 8, 9, 13, 14, 16, 18], "check_value_typ": 7, "tupl": [7, 14], "size": [7, 8, 12, 14, 15, 16], "custom": [7, 15, 16], "appear": [7, 16], "map": [7, 8, 14, 15], "color": [7, 13, 14, 15], "edg": [7, 15, 18], "bar": [7, 13, 14], "width": [7, 15], "line": [7, 15], "annot": 7, "font": [7, 12, 15, 16], "tick": [7, 16], "axi": [7, 16], "limit": 7, "highlight": 7, "area": 7, "alpha": 7, "ad": 7, "drawn": 7, "legend": [7, 15], "shap": [7, 14, 18], "shaplei": 7, "gener": [7, 8, 14, 16, 18, 20, 21], "other": [7, 11, 18], "intern": 7, "librari": [7, 16, 18], "ax": [7, 15], "matplotlib": [7, 15, 16, 18], "heatmap": [7, 13, 14], "8": [7, 8, 9, 18], "vmin": 7, "vmax": 7, "grid_on": 7, "cmap": [7, 13, 14], "rdbu_r": 7, "cmap_n_color": 7, "cbar_kw": 7, "facecolor_dark": [7, 14], "add_importance_map": 7, "cbar_pct": 7, "featuremap": 7, "versu": 7, "seaborn": [7, 14, 16, 18], "shown": 7, "feat_impact": 7, "displai": 7, "sum": 7, "std": 7, "aggreg": 7, "positions_onli": 7, "across": 7, "recommend": [7, 9, 18], "when": [7, 9], "emphas": [7, 18], "fewer": 7, "value_typ": 7, "height": 7, "figur": 7, "inch": 7, "pyplot": [7, 15], "anchor": [7, 15], "colormap": 7, "infer": [7, 18], "seismic": 7, "space": [7, 9, 14, 15], "impact": 7, "discret": 7, "diverg": 7, "sequenti": 7, "kei": [7, 18], "colorbar": 7, "under": [7, 18], "depicet": 7, "depict": 7, "jmd_n": [7, 8], "jmd_c": [7, 8], "point": [7, 15], "set_xticklabel": 7, "widht": 7, "tick_param": 7, "pcolormesh": 7, "effect": [7, 18], "align": [7, 15], "applic": 7, "document": [7, 22], "detail": [7, 10, 15, 17, 
18, 19], "code": [7, 14], "update_seq_s": 7, "retriev": [8, 13, 14], "continu": 8, "transmembran": 8, "principl": [8, 17], "segment": 8, "pattern": 8, "express": 8, "present": 8, "realiz": 8, "over": 8, "valid": [8, 18], "tmd_e": 8, "tmd_n": 8, "tmd_c": 8, "ext_c": 8, "ext_n": 8, "tmd_jmd": 8, "jmd_n_tmd_n": 8, "tmd_c_jmd_c": 8, "ext_n_tmd_n": 8, "tmd_c_ext_c": 8, "get_df_part": 8, "df_seq": [8, 9, 10], "list_part": 8, "all_part": 8, "datafran": 8, "compris": 8, "tmd_start": 8, "tmd_stop": 8, "string": [8, 14], "len": 8, "must": 8, "lenght": 8, "resp": 8, "extra": 8, "possibl": 8, "found": [8, 11, 18], "sf": 8, "gsec_sub_seq": 8, "n_split_min": 8, "n_split_max": 8, "steps_pattern": 8, "n_min": 8, "n_max": 8, "len_max": 8, "steps_periodicpattern": 8, "periodicpattern": 8, "greater": 8, "greatest": 8, "whole": [8, 10], "specfii": 8, "smallest": 8, "integ": 8, "6": 8, "vari": 8, "paramt": 8, "argumetn": 8, "get_featur": 8, "combin": [8, 18], "form": 8, "feat_matrix": 8, "n_job": 8, "return_label": 8, "pd": [8, 9, 10, 18], "seri": 8, "job": 8, "parallel": 8, "spars": 8, "feat_nam": 8, "convert": 8, "depend": 8, "last": 8, "step1": 8, "step2": 8, "add_feat_valu": 8, "dict_scal": 8, "convent": 8, "letter": 8, "feature_valu": 8, "n_part": 8, "ha": [8, 18], "structur": [8, 21], "th": 8, "n_split": 8, "p1": 8, "p2": 8, "pn": 8, "end": 8, "odd": 8, "even": 8, "give": 8, "add_dif": 8, "sample_nam": 8, "ref_group": 8, "add_posit": 8, "part_split": 8, "feat_posit": 8, "total": [8, 9], "n_compon": 9, "pca_kwarg": 9, "offer": [9, 18], "approach": 9, "pca": 9, "dimension": [9, 21], "iter": 9, "reliabl": [9, 18], "These": [9, 18], "those": 9, "distant": 9, "altern": 9, "distanc": 9, "manhattan": 9, "cosin": 9, "80": 9, "cover": 9, "varianc": 9, "identif": [9, 21], "datapoint": 9, "inspir": [9, 18], "techniqu": 9, "theoret": 9, "high": [9, 21], "n_neg": 9, "label_po": 9, "name_neg": 9, "rel_neg": 9, "col_class": 9, "newli": 9, "updat": [9, 18], "new": [9, 18], "store": 9, "Will": 9, 
"dure": 9, "initi": 9, "datafor": 9, "conta": 9, "po": 9, "unl": 9, "numpi": [9, 18], "np": 9, "atgc": 9, "gcta": 9, "actg": 9, "tacg": 9, "mode": 9, "modifi": [9, 10, 16], "dpul": 9, "info": 10, "non_canonical_aa": 10, "remov": [10, 16], "min_len": 10, "max_len": 10, "categor": 10, "overview": 10, "tabl": 10, "per": 10, "liter": 10, "keep": 10, "gap": [10, 14], "canon": 10, "do": 10, "replac": 10, "symbol": 10, "ref": 10, "just_aaindex": 11, "unclassified_in": 11, "through": 11, "scale_cat": 11, "relev": 11, "scale_classif": 11, "aaindex": [11, 21], "df": 11, "current": 12, "ut": 12, "plot_set": 12, "dict_scale_cat": [13, 14], "cppplot": [13, 14, 18], "respect": [13, 14, 17, 18, 19], "n_color": 14, "color_po": 14, "color_neg": 14, "color_cent": 14, "input": [14, 18], "hex": 14, "pct_gap": 14, "pct_center": 14, "palett": 14, "feat": 14, "ggplot": 14, "datagroup": 14, "dark": 14, "face": 14, "rgb": 14, "hl": 14, "husl": 14, "xkcd": 14, "latter": 14, "rang": 14, "sn": 14, "color_palett": 14, "light_palett": 14, "lighter": 14, "handl": 15, "list_cat": 15, "ncol": 15, "fontsiz": 15, "weight": [15, 21], "lw": 15, "edgecolor": 15, "return_handl": 15, "loc": 15, "upper": 15, "left": 15, "labelspac": 15, "columnspac": 15, "fontsize_legend": 15, "title_align_left": 15, "fontsize_weight": 15, "customiz": 15, "attach": 15, "item": 15, "coordin": 15, "text": [15, 16], "vertic": 15, "horizont": 15, "marker": 15, "directli": [15, 18], "finer": 15, "control": 15, "how": 15, "line2d": 15, "cat1": 15, "red": 15, "cat2": 15, "o": 15, "fig_format": 16, "pdf": 16, "font_scal": 16, "arial": 16, "change_s": 16, "weight_bold": 16, "adjust_el": 16, "short_tick": 16, "no_tick": 16, "no_ticks_i": 16, "short_ticks_i": 16, "no_ticks_x": 16, "short_ticks_x": 16, "configur": 16, "visual": [16, 18], "variou": [16, 18], "file": [16, 18], "save": 16, "make": [16, 18], "visibl": 16, "choos": 16, "san": 16, "serif": 16, "verdana": 16, "helvetica": 16, "dejavu": 16, "element": 16, "bold": 16, 
"layout": 16, "short": 16, "mark": 16, "global": 16, "pypi": 17, "conda": [17, 18], "forg": 17, "pip": [17, 18], "introduct": 17, "contribut": 17, "tutori": 17, "api": 17, "explain": [17, 18, 21], "ai": [17, 18, 21], "perturb": 17, "util": [17, 18], "search": 17, "your": [17, 18, 19], "work": [17, 19], "pleas": [17, 18, 19], "cite": [17, 19], "_": [17, 19], "breimann": [17, 19, 21], "kamp": [17, 19], "steiner": [17, 19], "frishman": [17, 19], "2023": [17, 19], "ontologi": [17, 19, 21], "machin": [17, 18, 19, 21], "biorxiv": [17, 19, 21], "welcom": 18, "thank": 18, "we": 18, "open": 18, "project": [18, 22], "focus": 18, "involv": 18, "invalu": 18, "made": 18, "wai": 18, "suggest": 18, "github": 18, "issu": 18, "tracker": 18, "submit": 18, "improv": [18, 21], "particip": 18, "discuss": 18, "newcom": 18, "tackl": 18, "good": 18, "email": 18, "stephanbreimann": 18, "gmail": 18, "com": 18, "question": 18, "establish": 18, "comprehens": 18, "toolkit": 18, "robust": 18, "life": 18, "scienc": 18, "integr": [18, 21], "seamlessli": 18, "flexibl": 18, "interoper": 18, "packag": 18, "biopython": 18, "reimplement": 18, "solut": 18, "ignor": 18, "biolog": [18, 20], "context": 18, "relianc": 18, "opaqu": 18, "box": 18, "empir": 18, "insight": 18, "cut": 18, "fair": 18, "account": 18, "transpar": 18, "re": [18, 21], "commit": 18, "divers": 18, "aspect": 18, "causal": 18, "minim": 18, "reproduc": 18, "mre": 18, "least": 18, "amount": 18, "demonstr": 18, "self": 18, "ensur": 18, "necessari": 18, "confirm": 18, "replic": 18, "guidelin": 18, "here": [18, 22], "To": 18, "git": 18, "breimanntool": 18, "master": 18, "repositori": 18, "your_usernam": 18, "navig": 18, "folder": 18, "up": 18, "cd": 18, "isol": 18, "aanalysi": 18, "9": 18, "activ": 18, "poetri": 18, "pytest": 18, "hypothesi": 18, "execut": 18, "case": 18, "directori": 18, "substanti": 18, "minor": 18, "typo": 18, "concis": 18, "clear": 18, "branch": 18, "fix": 18, "readm": 18, "date": 18, "readthedoc": 18, "crucial": 18, 
"modif": 18, "thei": 18, "render": 18, "correctli": 18, "strive": 18, "consist": [18, 20], "interfac": 18, "well": 18, "organ": 18, "codebas": 18, "standalon": 18, "focu": 18, "special": 18, "task": 18, "carri": 18, "out": 18, "complet": 18, "process": 18, "fulfil": 18, "purpos": 18, "being": 18, "implement": 18, "inherit": 18, "supplementari": 18, "accordingli": 18, "suffix": 18, "support": 18, "semi": 18, "strictli": 18, "adher": 18, "aforement": 18, "primari": 18, "_util": 18, "_utils_const": 18, "py": 18, "modular": 18, "easili": 18, "therefor": 18, "flat": 18, "hierarchi": 18, "program": 18, "outlin": 18, "softwar": 18, "user": 18, "friendli": 18, "hint": 18, "enhanc": 18, "propos": 18, "pep": 18, "484": 18, "book": 18, "error": 18, "messag": 18, "docstr": 18, "257": 18, "guid": 18, "markup": 18, "languag": 18, "restructuredtext": 18, "rst": 18, "primer": 18, "autodoc": 18, "sphinx": 18, "inclus": 18, "napoleon": 18, "extens": 18, "conf": 18, "go": 18, "_build": 18, "browser": 18, "citat": 19, "wa": 20, "develop": 20, "typic": 20, "et": 21, "al": 21, "2023a": 21, "2023b": 21, "2023c": 21, "chart": 21, "\u03b3": 21, "cheng": 21, "2006": 21, "larg": 21, "disulphid": 21, "kernel": 21, "recurs": 21, "neural": 21, "network": 21, "graph": 21, "match": 21, "struct": 21, "funct": 21, "kawashima": 21, "2008": 21, "aid": 21, "databas": 21, "report": 21, "nucleic": 21, "magnan": 21, "randal": 21, "baldi": 21, "2009": 21, "accur": 21, "bioinformat": 21, "galiez": 21, "2016": 21, "viral": 21, "song": 21, "2018": 21, "throughput": 21, "90": 21, "proteas": 21, "accuraci": 21, "shen": 21, "2019": 21, "local": 21, "evolutionari": 21, "chou": 21, "pseaac": 21, "j": 21, "theor": 21, "biol": 21, "tang": 21, "2020": 21, "teng": 21, "2021": 21, "pseudo": 21, "composit": 21, "tripeptid": 21, "bmc": 21, "yang": 21, "granular": 21, "multipl": 21, "appl": 21, "chronolog": 22, "histori": 22, "overview_benchmark": 22, "overview_scal": 22, "view": 24}, "objects": {"aaanalysis": [[5, 0, 1, 
"", "AAclust"], [6, 0, 1, "", "CPP"], [7, 0, 1, "", "CPPPlot"], [8, 0, 1, "", "SequenceFeature"], [9, 0, 1, "", "dPULearn"], [10, 3, 1, "", "load_dataset"], [11, 3, 1, "", "load_scales"], [12, 3, 1, "", "plot_gcfs"], [13, 3, 1, "", "plot_get_cdict"], [14, 3, 1, "", "plot_get_cmap"], [15, 3, 1, "", "plot_set_legend"], [16, 3, 1, "", "plot_settings"]], "aaanalysis.AAclust": [[5, 1, 1, "", "__init__"], [5, 2, 1, "", "center_labels_"], [5, 2, 1, "", "centers_"], [5, 1, 1, "", "cluster_naming"], [5, 1, 1, "", "correlation"], [5, 1, 1, "", "eval"], [5, 1, 1, "", "fit"], [5, 1, 1, "", "get_cluster_centers"], [5, 1, 1, "", "get_cluster_medoids"], [5, 2, 1, "", "labels_"], [5, 2, 1, "", "medoid_ind_"], [5, 2, 1, "", "medoid_labels_"], [5, 2, 1, "", "medoids_"], [5, 2, 1, "", "n_clusters"]], "aaanalysis.CPP": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "eval"], [6, 1, 1, "", "run"]], "aaanalysis.CPPPlot": [[7, 1, 1, "", "__init__"], [7, 1, 1, "", "heatmap"], [7, 1, 1, "", "profile"], [7, 1, 1, "", "update_seq_size"]], "aaanalysis.SequenceFeature": [[8, 1, 1, "", "__init__"], [8, 1, 1, "", "add_dif"], [8, 1, 1, "", "add_feat_value"], [8, 1, 1, "", "add_position"], [8, 1, 1, "", "feat_matrix"], [8, 1, 1, "", "feat_names"], [8, 1, 1, "", "get_df_parts"], [8, 1, 1, "", "get_features"], [8, 1, 1, "", "get_split_kws"]], "aaanalysis.dPULearn": [[9, 1, 1, "", "__init__"], [9, 1, 1, "", "eval"], [9, 1, 1, "", "fit"], [9, 2, 1, "", "labels_"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:attribute", "3": "py:function"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "function", "Python function"]}, "titleterms": {"tabl": [2, 17, 22], "overview": [2, 17, 22], "protein": [2, 22, 24], "benchmark": [2, 21, 22], "dataset": [2, 21, 22], "amino": 2, "acid": 2, "scale": 2, "data": [3, 4, 24], "load": [3, 24], "api": 4, "featur": [4, 24], "engin": [4, 24], "pu": 4, "learn": 4, 
"explain": 4, "ai": 4, "perturb": 4, "plot": 4, "util": 4, "aaanalysi": [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], "aaclust": 5, "note": [5, 6, 8, 9, 10, 16], "cpp": 6, "cppplot": 7, "exampl": [7, 8, 9, 15, 16, 17], "sequencefeatur": 8, "dpulearn": 9, "load_dataset": 10, "load_scal": 11, "plot_gcf": 12, "plot_get_cdict": 13, "plot_get_cmap": 14, "plot_set_legend": 15, "plot_set": 16, "welcom": 17, "document": [17, 18], "instal": [17, 18], "refer": [17, 21], "indic": 17, "citat": 17, "contribut": 18, "introduct": [18, 20], "vision": 18, "object": 18, "non": 18, "goal": 18, "principl": [18, 23], "bug": 18, "report": 18, "latest": 18, "version": 18, "local": 18, "develop": 18, "environ": 18, "fork": 18, "clone": 18, "depend": 18, "run": 18, "unit": 18, "test": 18, "pull": 18, "request": 18, "preview": 18, "chang": 18, "name": 18, "convent": 18, "class": 18, "templat": 18, "function": 18, "method": 18, "code": 18, "philosophi": 18, "style": 18, "build": 18, "doc": 18, "workflow": 20, "algorithm": 21, "us": 21, "case": 21, "further": 21, "inform": 21, "usag": 23, "tutori": 24, "redund": 24, "reduct": 24, "identif": 24, "neg": 24, "predict": 24}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx": 57}, "alltitles": {"Tables": [[2, "tables"], [22, "tables"]], "Overview Table": [[2, "overview-table"], [22, "overview-table"]], "Protein benchmark datasets": [[2, "protein-benchmark-datasets"], [22, "protein-benchmark-datasets"]], "Amino acid scale datasets": [[2, "amino-acid-scale-datasets"]], "Data Loading": [[3, "data-loading"]], "API": [[4, "api"]], "Data": [[4, "data"]], "Feature Engineering": [[4, "feature-engineering"]], "PU Learning": [[4, "pu-learning"]], "Explainable AI": [[4, 
"explainable-ai"]], "Perturbation": [[4, "perturbation"]], "Plot Utilities": [[4, "plot-utilities"]], "aaanalysis.AAclust": [[5, "aaanalysis-aaclust"]], "Notes": [[5, null], [6, null], [6, null], [8, null], [8, null], [8, null], [8, null], [8, null], [9, null], [9, null], [10, null], [16, null]], "aaanalysis.CPP": [[6, "aaanalysis-cpp"]], "aaanalysis.CPPPlot": [[7, "aaanalysis-cppplot"]], "Examples": [[7, null], [8, null], [8, null], [9, null], [15, null], [16, null]], "aaanalysis.SequenceFeature": [[8, "aaanalysis-sequencefeature"]], "aaanalysis.dPULearn": [[9, "aaanalysis-dpulearn"]], "aaanalysis.load_dataset": [[10, "aaanalysis-load-dataset"]], "aaanalysis.load_scales": [[11, "aaanalysis-load-scales"]], "aaanalysis.plot_gcfs": [[12, "aaanalysis-plot-gcfs"]], "aaanalysis.plot_get_cdict": [[13, "aaanalysis-plot-get-cdict"]], "aaanalysis.plot_get_cmap": [[14, "aaanalysis-plot-get-cmap"]], "aaanalysis.plot_set_legend": [[15, "aaanalysis-plot-set-legend"]], "aaanalysis.plot_settings": [[16, "aaanalysis-plot-settings"]], "Welcome to the AAanalysis documentation": [[17, "welcome-to-the-aaanalysis-documentation"]], "Install": [[17, "install"]], "OVERVIEW": [[17, null]], "EXAMPLES": [[17, null]], "REFERENCES": [[17, null]], "Indices and tables": [[17, "indices-and-tables"]], "Citation": [[17, "citation"]], "Contributing": [[18, "contributing"]], "Introduction": [[18, "introduction"], [20, "introduction"]], "Vision": [[18, "vision"]], "Objectives": [[18, "objectives"]], "Non-goals": [[18, "non-goals"]], "Principles": [[18, "principles"]], "Bug Reports": [[18, "bug-reports"]], "Installation": [[18, "installation"]], "Latest Version": [[18, "latest-version"]], "Local Development Environment": [[18, "local-development-environment"]], "Fork and Clone": [[18, "fork-and-clone"]], "Install Dependencies": [[18, "install-dependencies"]], "Run Unit Tests": [[18, "run-unit-tests"]], "Pull Requests": [[18, "pull-requests"]], "Preview Changes": [[18, "preview-changes"]], 
"Documentation": [[18, "documentation"]], "Naming Conventions": [[18, "naming-conventions"]], "Class Templates": [[18, "class-templates"]], "Function and Method Naming": [[18, "function-and-method-naming"]], "Code Philosophy": [[18, "code-philosophy"]], "Documentation Style": [[18, "documentation-style"]], "Building the Docs": [[18, "building-the-docs"]], "Workflow": [[20, "workflow"]], "References": [[21, "references"]], "Algorithms": [[21, "algorithms"]], "Datasets and Benchmarks": [[21, "datasets-and-benchmarks"]], "Use Cases": [[21, "use-cases"]], "Further Information": [[21, "further-information"]], "Usage Principles": [[23, "usage-principles"]], "Tutorials": [[24, "tutorials"]], "Data loading": [[24, "data-loading"]], "Redundancy-reduction": [[24, "redundancy-reduction"]], "Feature engineering": [[24, "feature-engineering"]], "Identification of negatives": [[24, "identification-of-negatives"]], "Protein prediction": [[24, "protein-prediction"]]}, "indexentries": {"aaclust (class in aaanalysis)": [[5, "aaanalysis.AAclust"]], "__init__() (aaanalysis.aaclust method)": [[5, "aaanalysis.AAclust.__init__"]], "center_labels_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.center_labels_"]], "centers_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.centers_"]], "cluster_naming() (aaanalysis.aaclust method)": [[5, "aaanalysis.AAclust.cluster_naming"]], "correlation() (aaanalysis.aaclust static method)": [[5, "aaanalysis.AAclust.correlation"]], "eval() (aaanalysis.aaclust method)": [[5, "aaanalysis.AAclust.eval"]], "fit() (aaanalysis.aaclust method)": [[5, "aaanalysis.AAclust.fit"]], "get_cluster_centers() (aaanalysis.aaclust static method)": [[5, "aaanalysis.AAclust.get_cluster_centers"]], "get_cluster_medoids() (aaanalysis.aaclust static method)": [[5, "aaanalysis.AAclust.get_cluster_medoids"]], "labels_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.labels_"]], "medoid_ind_ (aaanalysis.aaclust attribute)": [[5, 
"aaanalysis.AAclust.medoid_ind_"]], "medoid_labels_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.medoid_labels_"]], "medoids_ (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.medoids_"]], "n_clusters (aaanalysis.aaclust attribute)": [[5, "aaanalysis.AAclust.n_clusters"]], "cpp (class in aaanalysis)": [[6, "aaanalysis.CPP"]], "__init__() (aaanalysis.cpp method)": [[6, "aaanalysis.CPP.__init__"]], "eval() (aaanalysis.cpp static method)": [[6, "aaanalysis.CPP.eval"]], "run() (aaanalysis.cpp method)": [[6, "aaanalysis.CPP.run"]], "cppplot (class in aaanalysis)": [[7, "aaanalysis.CPPPlot"]], "__init__() (aaanalysis.cppplot method)": [[7, "aaanalysis.CPPPlot.__init__"]], "heatmap() (aaanalysis.cppplot method)": [[7, "aaanalysis.CPPPlot.heatmap"]], "profile() (aaanalysis.cppplot method)": [[7, "aaanalysis.CPPPlot.profile"]], "update_seq_size() (aaanalysis.cppplot method)": [[7, "aaanalysis.CPPPlot.update_seq_size"]], "sequencefeature (class in aaanalysis)": [[8, "aaanalysis.SequenceFeature"]], "__init__() (aaanalysis.sequencefeature method)": [[8, "aaanalysis.SequenceFeature.__init__"]], "add_dif() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.add_dif"]], "add_feat_value() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.add_feat_value"]], "add_position() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.add_position"]], "feat_matrix() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.feat_matrix"]], "feat_names() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.feat_names"]], "get_df_parts() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.get_df_parts"]], "get_features() (aaanalysis.sequencefeature method)": [[8, "aaanalysis.SequenceFeature.get_features"]], "get_split_kws() (aaanalysis.sequencefeature static method)": [[8, "aaanalysis.SequenceFeature.get_split_kws"]], 
"__init__() (aaanalysis.dpulearn method)": [[9, "aaanalysis.dPULearn.__init__"]], "dpulearn (class in aaanalysis)": [[9, "aaanalysis.dPULearn"]], "eval() (aaanalysis.dpulearn method)": [[9, "aaanalysis.dPULearn.eval"]], "fit() (aaanalysis.dpulearn method)": [[9, "aaanalysis.dPULearn.fit"]], "labels_ (aaanalysis.dpulearn attribute)": [[9, "aaanalysis.dPULearn.labels_"]], "load_dataset() (in module aaanalysis)": [[10, "aaanalysis.load_dataset"]], "load_scales() (in module aaanalysis)": [[11, "aaanalysis.load_scales"]], "plot_gcfs() (in module aaanalysis)": [[12, "aaanalysis.plot_gcfs"]], "plot_get_cdict() (in module aaanalysis)": [[13, "aaanalysis.plot_get_cdict"]], "plot_get_cmap() (in module aaanalysis)": [[14, "aaanalysis.plot_get_cmap"]], "plot_set_legend() (in module aaanalysis)": [[15, "aaanalysis.plot_set_legend"]], "plot_settings() (in module aaanalysis)": [[16, "aaanalysis.plot_settings"]]}}) \ No newline at end of file diff --git a/docs/build/html/tutorials.html b/docs/build/html/tutorials.html index 795f741e..42faf751 100644 --- a/docs/build/html/tutorials.html +++ b/docs/build/html/tutorials.html @@ -104,7 +104,7 @@
  • - View page source + Edit on GitHub

  • diff --git a/docs/source/_static/css/style.css b/docs/source/_static/css/style.css index b8bfd848..b6f89b7b 100755 --- a/docs/source/_static/css/style.css +++ b/docs/source/_static/css/style.css @@ -32,6 +32,13 @@ background: #f7f7f7; /* Sets the background color */ } +/* Style for inline code */ +.rst-content code { + background-color: #f5f5f5; /* Gray background */ + font-family: monospace; /* Monospace font */ + padding: 2px 2px; /* Padding around the text */ +} + /* Style for the search input box in the sidebar */ .wy-side-nav-search input[type=text] { border-color: #666666; /* Sets the border color */ @@ -67,4 +74,4 @@ html.writer-html4 .rst-content dl:not(.docutils)>dt, html.writer-html5 .rst-cont /* Style for vertical menu items */ .wy-menu-vertical a { color: #d9d9d9; /* Sets the text color */ -} +} \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 6706575d..13fa651e 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,7 +43,6 @@ 'sphinx.ext.autodoc', # Autogenerate documentation from docstrings #'numpydoc', # Support for Numpy-style docstrings 'sphinx.ext.autosummary', # Generate summary tables for API reference - 'sphinx.ext.viewcode', # Link from docs to source code 'sphinx_rtd_theme', # Theme emulating "Read the Docs" style # "sphinx_book_theme" 'sphinx_copybutton', # Adds a "copy" button to code blocks 'sphinx.ext.intersphinx', # Links to documentation of objects in other Sphinx projects @@ -146,6 +145,15 @@ html_show_sphinx = False html_logo = "_artwork/logo_big_trans.png" html_favicon = "_artwork/logo_small.png" + +html_context = { + 'display_github': True, # Add the 'Edit on GitHub' link + 'github_user': 'breimanntools', + 'github_repo': 'aaanalysis', + 'github_version': 'master/docs/source/', +} + + """ html_favicon = "path_to_your_favicon.ico" htmlhelp_basename = "YOUR_PROJECT_NAMEdoc"