diff --git a/aaanalysis/__pycache__/utils.cpython-39.pyc b/aaanalysis/__pycache__/utils.cpython-39.pyc index 7bd85d19..ce7d9e8b 100644 Binary files a/aaanalysis/__pycache__/utils.cpython-39.pyc and b/aaanalysis/__pycache__/utils.cpython-39.pyc differ diff --git a/aaanalysis/_utils/__init__.py b/aaanalysis/_utils/__init__.py index e69de29b..8b137891 100644 --- a/aaanalysis/_utils/__init__.py +++ b/aaanalysis/_utils/__init__.py @@ -0,0 +1 @@ + diff --git a/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc b/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc index ec36d995..a61deb51 100644 Binary files a/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc index 07d15af7..1c0a77f9 100644 Binary files a/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc differ diff --git a/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc deleted file mode 100644 index 7bd6f4e7..00000000 Binary files a/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc and /dev/null differ diff --git a/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc index 98d80c94..f93d2f5d 100644 Binary files a/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc differ diff --git a/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc b/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc index 59668b2c..e38ecd09 100644 Binary files a/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc differ diff --git a/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc b/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc index 6b292095..b7919e8f 100644 Binary files a/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc differ diff --git a/aaanalysis/_utils/_utils_constants.py b/aaanalysis/_utils/_utils_constants.py deleted file mode 100644 index 30b103e1..00000000 --- a/aaanalysis/_utils/_utils_constants.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -This is a script containing constant names, column names, or colors. - -AAanalysis comprises these primary pd.DataFrames: df_seq, df_part, df_cat, df_scales, df_feat - -""" - -# Default scale datasets for protein analysis -STR_SCALES = "scales" # Min-max normalized scales (from AAontology) -STR_SCALES_RAW = "scales_raw" # Raw scales (from AAontology) -STR_SCALES_PC = "scales_pc" # AAclust pc-based scales (pc: principal component) -STR_SCALE_CAT = "scales_cat" # AAontology -STR_TOP60 = "top60" # AAclustTop60 -STR_TOP60_EVAL = "top60_eval" # AAclustTop60 evaluation -NAMES_SCALE_SETS = [STR_SCALES, STR_SCALES_RAW, STR_SCALE_CAT, - STR_SCALES_PC, STR_TOP60, STR_TOP60_EVAL] - - -# Column names for primary df -# df_seq -COL_ENTRY = "entry" # ACC, protein entry, uniprot id -COL_NAME = "name" # Entry name, Protein name, Uniprot Name -COL_LABEL = "label" -COL_SEQ = "sequence" -COLS_PARTS = ["jmd_n", "tmd", "jmd_c"] -COL_TMD_START = "tmd_start" -COL_TMD_STOP = "tmd_stop" -COLS_SEQ_KEY = [COL_ENTRY, COL_SEQ, COL_LABEL] -COLS_SEQ_TMD_POS_KEY = [COL_SEQ, COL_TMD_START, COL_TMD_STOP] # TODO adjust to COL_ENTRY -COLS_SEQ_TMD_PART_KEY = [COL_ENTRY, COL_SEQ] + COLS_PARTS -# df_part - -# df_scales -# Column for df_cat (as defined in AAontology, retrieved by aa.load_scales(name="scale_cat")) -COL_SCALE_ID = "scale_id" -COL_CAT = "category" -COL_SUBCAT = "subcategory" -COL_SCALE_NAME = "scale_name" -COL_SCALE_DES = "scale_description" - - -# Columns for df_feat -COL_FEATURE = "feature" -# COL_CAT, COL_SUBCAT, COL_SCALE_NAME, COL_SCALE_DES -COL_ABS_AUC = "abs_auc" -COL_ABS_MEAN_DIF = "abs_mean_dif" -COL_MEAN_DIF = "mean_dif" -COL_STD_TEST = "std_test" -COL_STD_REF = "std_ref" -COL_PVAL_MW = "p_val_mann_whitney" -COL_PVAL_FDR = "p_val_fdr_bh" -COL_POSITION = "positions" - -# Columns for df_feat after processing with explainable AI methods -COL_FEAT_IMPORTANCE = "feat_importance" -COO_FEAT_IMP_STD = "feat_importance_std" -COL_FEAT_IMPACT = "feat_impact" - - -# Column name datasets (DOM_GSEC) - - - -# Column names cpp features - - -# Standard colors -COLOR_SHAP_POS = '#FF0D57' # (255, 13, 87) -COLOR_SHAP_NEG = '#1E88E5' # (30, 136, 229) -COLOR_FEAT_POS = '#9D2B39' # (157, 43, 57) Mean difference -COLOR_FEAT_NEG = '#326599' # (50, 101, 133) Mean difference -COLOR_FEAT_IMP = '#7F7F7F' # (127, 127, 127) feature importance -COLOR_TMD = '#00FA9A' # (0, 250, 154) -COLOR_JMD = '#0000FF' # (0, 0, 255) - -DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS, - "SHAP_NEG": COLOR_SHAP_NEG, - "FEAT_POS": COLOR_FEAT_POS, - "FEAT_NEG": COLOR_FEAT_NEG, - "FEAT_IMP": COLOR_FEAT_IMP, - "TMD": COLOR_TMD, - "JMD": COLOR_JMD} - -DICT_COLOR_CAT = {"ASA/Volume": "tab:blue", - "Composition": "tab:orange", - "Conformation": "tab:green", - "Energy": "tab:red", - "Others": "tab:gray", - "Polarity": "gold", - "Shape": "tab:cyan", - "Structure-Activity": "tab:brown"} \ No newline at end of file diff --git a/aaanalysis/_utils/utils_aaclust.py b/aaanalysis/_utils/utils_aaclust.py index 01038ca6..26c45173 100644 --- a/aaanalysis/_utils/utils_aaclust.py +++ b/aaanalysis/_utils/utils_aaclust.py @@ -7,7 +7,6 @@ METRIC_CORRELATION = "correlation" LIST_METRICS = [METRIC_CORRELATION, "manhattan", "euclidean", "cosine"] - # Check functions def check_model(model=None, model_kwargs=None, except_None=True): """""" diff --git a/aaanalysis/_utils/utils_cpp.py b/aaanalysis/_utils/utils_cpp.py index d9ca36ae..e701eb11 100644 --- a/aaanalysis/_utils/utils_cpp.py +++ b/aaanalysis/_utils/utils_cpp.py @@ -2,13 +2,10 @@ This is a script with utility functions and settings for CPP project. """ import numpy as np -import pandas as pd import matplotlib.colors as mcolors import matplotlib.pyplot as plt -import aaanalysis._utils._utils_constants as ut_c import aaanalysis._utils._utils_check as ut_check -import aaanalysis._utils._utils_output as ut_o # Settings @@ -104,7 +101,7 @@ def _check_seq(seq, len_, name_seq, name_len): raise ValueError(f"The length of {seq} ({len(seq)}) should be >= {name_len} ({len_}).") return len(seq) - +# TODO split in two (separation of concerns) def check_args_len(tmd_len=None, jmd_n_len=None, jmd_c_len=None, ext_len=None, tmd_seq=None, jmd_n_seq=None, jmd_c_seq=None, accept_tmd_none=False): """Check length parameters and if they are matching with sequences if provided""" @@ -116,6 +113,7 @@ def check_args_len(tmd_len=None, jmd_n_len=None, jmd_c_len=None, ext_len=None, ut_check.check_non_negative_number(name="ext_len", val=ext_len, accept_none=True) # Check if lengths and sequences match tmd_len = _check_seq(tmd_seq, tmd_len, "tmd_seq", "tmd_len") + print(jmd_n_seq) jmd_n_len = _check_seq(jmd_n_seq, jmd_n_len, "jmd_n_seq", "jmd_n_len") jmd_c_len = _check_seq(jmd_c_seq, jmd_c_len, "jmd_c_seq", "jmd_c_len") # Check if lengths are matching @@ -269,7 +267,7 @@ def check_split(split=None): # Scale functions def get_dict_all_scales(df_scales=None): - """Get nested dictionary where each scales is a key for a amino acid scale value dictionary""" + """Get nested dictionary where each scale is a key for an amino acid scale value dictionary""" dict_all_scales = {col: dict(zip(df_scales.index.to_list(), df_scales[col])) for col in list(df_scales)} return dict_all_scales diff --git a/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc b/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc index 192664dc..27cd1216 100644 Binary files a/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc and b/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc differ diff --git a/aaanalysis/cpp/feature.py b/aaanalysis/cpp/feature.py index 1dc1e13c..9f6df6a0 100644 --- a/aaanalysis/cpp/feature.py +++ b/aaanalysis/cpp/feature.py @@ -164,7 +164,7 @@ def _feature_matrix(feat_names, dict_all_scales, df_parts, accept_gaps): accept_gaps=accept_gaps) return feat_matrix - + # II Main Functions class SequenceFeature: """Retrieve and create sequence feature components (Part, Split, and Scale). diff --git a/aaanalysis/utils.py b/aaanalysis/utils.py index 0a3f6736..d499a5ef 100644 --- a/aaanalysis/utils.py +++ b/aaanalysis/utils.py @@ -4,16 +4,22 @@ import os import platform from functools import lru_cache +import pandas as pd +import numpy as np -# Import utility functions for specific purposes -from aaanalysis._utils._utils_constants import * -from aaanalysis._utils._utils_check import * -from aaanalysis._utils._utils_output import * - -# Import utility function for specific modules -from aaanalysis._utils.utils_aaclust import * -from aaanalysis._utils.utils_cpp import * - +# Import utility functions explicitly +from aaanalysis._utils._utils_check import (check_non_negative_number, check_float, check_str, check_bool, + check_dict, check_tuple, + check_feat_matrix, check_col_in_df) +from aaanalysis._utils._utils_output import (print_red, print_start_progress, print_progress, print_finished_progress) +from aaanalysis._utils.utils_aaclust import (check_model, check_min_th, check_merge_metric, + METRIC_CORRELATION, LIST_METRICS) +from aaanalysis._utils.utils_cpp import (check_color, check_y_categorical, check_labels, check_ylim, + check_args_len, check_list_parts, check_split_kws, check_split, + get_dict_all_scales, get_vf_scale, + STR_SEGMENT, STR_PATTERN, STR_PERIODIC_PATTERN, STR_AA_GAP, + LIST_PARTS, LIST_ALL_PARTS, SPLIT_DESCRIPTION) +#from aaanalysis.utils.utils_dpulearn import () # I Folder structure def _folder_path(super_folder, folder_name): @@ -28,7 +34,95 @@ def _folder_path(super_folder, folder_name): URL_DATA = "https://github.com/breimanntools/aaanalysis/tree/master/aaanalysis/data/" +# Constants +# Default scale datasets for protein analysis +STR_SCALES = "scales" # Min-max normalized scales (from AAontology) +STR_SCALES_RAW = "scales_raw" # Raw scales (from AAontology) +STR_SCALES_PC = "scales_pc" # AAclust pc-based scales (pc: principal component) +STR_SCALE_CAT = "scales_cat" # AAontology +STR_TOP60 = "top60" # AAclustTop60 +STR_TOP60_EVAL = "top60_eval" # AAclustTop60 evaluation +NAMES_SCALE_SETS = [STR_SCALES, STR_SCALES_RAW, STR_SCALE_CAT, + STR_SCALES_PC, STR_TOP60, STR_TOP60_EVAL] + + +# Column names for primary df +# df_seq +COL_ENTRY = "entry" # ACC, protein entry, uniprot id +COL_NAME = "name" # Entry name, Protein name, Uniprot Name +COL_LABEL = "label" +COL_SEQ = "sequence" +COLS_PARTS = ["jmd_n", "tmd", "jmd_c"] +COL_TMD_START = "tmd_start" +COL_TMD_STOP = "tmd_stop" +COLS_SEQ_KEY = [COL_ENTRY, COL_SEQ, COL_LABEL] +COLS_SEQ_TMD_POS_KEY = [COL_SEQ, COL_TMD_START, COL_TMD_STOP] # TODO adjust to COL_ENTRY +COLS_SEQ_TMD_PART_KEY = [COL_ENTRY, COL_SEQ] + COLS_PARTS +# df_part + +# df_scales +# Column for df_cat (as defined in AAontology, retrieved by aa.load_scales(name="scale_cat")) +COL_SCALE_ID = "scale_id" +COL_CAT = "category" +COL_SUBCAT = "subcategory" +COL_SCALE_NAME = "scale_name" +COL_SCALE_DES = "scale_description" + + +# Columns for df_feat +COL_FEATURE = "feature" +# COL_CAT, COL_SUBCAT, COL_SCALE_NAME, COL_SCALE_DES +COL_ABS_AUC = "abs_auc" +COL_ABS_MEAN_DIF = "abs_mean_dif" +COL_MEAN_DIF = "mean_dif" +COL_STD_TEST = "std_test" +COL_STD_REF = "std_ref" +COL_PVAL_MW = "p_val_mann_whitney" +COL_PVAL_FDR = "p_val_fdr_bh" +COL_POSITION = "positions" + +# Columns for df_feat after processing with explainable AI methods +COL_FEAT_IMPORTANCE = "feat_importance" +COO_FEAT_IMP_STD = "feat_importance_std" +COL_FEAT_IMPACT = "feat_impact" + + +# Column name datasets (DOM_GSEC) + + + +# Column names cpp features + + +# Standard colors +COLOR_SHAP_POS = '#FF0D57' # (255, 13, 87) +COLOR_SHAP_NEG = '#1E88E5' # (30, 136, 229) +COLOR_FEAT_POS = '#9D2B39' # (157, 43, 57) Mean difference +COLOR_FEAT_NEG = '#326599' # (50, 101, 133) Mean difference +COLOR_FEAT_IMP = '#7F7F7F' # (127, 127, 127) feature importance +COLOR_TMD = '#00FA9A' # (0, 250, 154) +COLOR_JMD = '#0000FF' # (0, 0, 255) + +DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS, + "SHAP_NEG": COLOR_SHAP_NEG, + "FEAT_POS": COLOR_FEAT_POS, + "FEAT_NEG": COLOR_FEAT_NEG, + "FEAT_IMP": COLOR_FEAT_IMP, + "TMD": COLOR_TMD, + "JMD": COLOR_JMD} + +DICT_COLOR_CAT = {"ASA/Volume": "tab:blue", + "Composition": "tab:orange", + "Conformation": "tab:green", + "Energy": "tab:red", + "Others": "tab:gray", + "Polarity": "gold", + "Shape": "tab:cyan", + "Structure-Activity": "tab:brown"} + + # II MAIN FUNCTIONS +# Main Helper functions # Caching for data loading for better performance (data loaded ones) @lru_cache(maxsize=None) def read_excel_cached(name, index_col=None): @@ -43,32 +137,33 @@ def read_csv_cached(name, sep=None): return df.copy() +# Main check functions # Check key dataframes using constants and general checking functions (df_seq, df_parts, df_cat, df_scales, df_feat) def check_df_seq(df_seq=None, jmd_n_len=None, jmd_c_len=None): """Get features from df""" # TODO check if df_seq is None or not isinstance(df_seq, pd.DataFrame): raise ValueError("Type of 'df_seq' ({}) must be pd.DataFrame".format(type(df_seq))) - if ut_c.COL_ENTRY not in list(df_seq): - raise ValueError("'{}' must be in 'df_seq'".format(ut_c.COL_ENTRY)) - seq_info_in_df = set(ut_c.COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq)) - parts_in_df = set(ut_c.COLS_PARTS).issubset(set(df_seq)) - seq_in_df = ut_c.COL_SEQ in set(df_seq) + if COL_ENTRY not in list(df_seq): + raise ValueError("'{}' must be in 'df_seq'".format(COL_ENTRY)) + seq_info_in_df = set(COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq)) + parts_in_df = set(COLS_PARTS).issubset(set(df_seq)) + seq_in_df = COL_SEQ in set(df_seq) if "start" in list(df_seq): - raise ValueError(f"'df_seq' should not contain 'start' in columns. Change column to '{ut_c.COL_TMD_START}'.") + raise ValueError(f"'df_seq' should not contain 'start' in columns. Change column to '{COL_TMD_START}'.") if "stop" in list(df_seq): - raise ValueError(f"'df_seq' should not contain 'stop' in columns. Change column to '{ut_c.COL_TMD_STOP}'.") + raise ValueError(f"'df_seq' should not contain 'stop' in columns. Change column to '{COL_TMD_STOP}'.") if not (seq_info_in_df or parts_in_df or seq_in_df): - raise ValueError(f"'df_seq' should contain ['{ut_c.COL_SEQ}'], {ut_c.COLS_SEQ_TMD_POS_KEY}, or {ut_c.COLS_PARTS}") + raise ValueError(f"'df_seq' should contain ['{COL_SEQ}'], {COLS_SEQ_TMD_POS_KEY}, or {COLS_PARTS}") # Check data type in part or sequence columns else: if seq_info_in_df or seq_in_df: - error = f"Sequence column ('{ut_c.COL_SEQ}') should only contain strings" - dict_wrong_seq = {ut_c.COL_SEQ: [x for x in df_seq[ut_c.COL_SEQ].values if type(x) != str]} + error = f"Sequence column ('{COL_SEQ}') should only contain strings" + dict_wrong_seq = {COL_SEQ: [x for x in df_seq[COL_SEQ].values if type(x) != str]} else: - cols = ut_c.COLS_PARTS + cols = COLS_PARTS error = f"Part columns ('{cols}') should only contain strings" - dict_wrong_seq = {part: [x for x in df_seq[part].values if type(x) != str] for part in ut_c.COLS_PARTS} + dict_wrong_seq = {part: [x for x in df_seq[part].values if type(x) != str] for part in COLS_PARTS} # Filter empty lists dict_wrong_seq = {part: dict_wrong_seq[part] for part in dict_wrong_seq if len(dict_wrong_seq[part]) > 0} n_wrong_entries = sum([len(dict_wrong_seq[part]) for part in dict_wrong_seq]) @@ -78,33 +173,33 @@ def check_df_seq(df_seq=None, jmd_n_len=None, jmd_c_len=None): # Check if only sequence given -> Convert sequence to tmd if seq_in_df and not parts_in_df: if seq_info_in_df: - for entry, start, stop in zip(df_seq[ut_c.COL_ENTRY], df_seq[ut_c.COL_TMD_START], df_seq[ut_c.COL_TMD_STOP]): - ut_check.check_non_negative_number(name=f"tmd_start [{entry}]", val=start) - ut_check.check_non_negative_number(name=f"tmd_start [{entry}]", val=stop,) - tmd_start = [int(x) for x in df_seq[ut_c.COL_TMD_START]] - tmd_stop = [int(x) for x in df_seq[ut_c.COL_TMD_STOP]] + for entry, start, stop in zip(df_seq[COL_ENTRY], df_seq[COL_TMD_START], df_seq[COL_TMD_STOP]): + check_non_negative_number(name=f"tmd_start [{entry}]", val=start) + check_non_negative_number(name=f"tmd_start [{entry}]", val=stop,) + tmd_start = [int(x) for x in df_seq[COL_TMD_START]] + tmd_stop = [int(x) for x in df_seq[COL_TMD_STOP]] else: tmd_start = 1 if jmd_n_len is None else 1 + jmd_n_len - tmd_stop = [len(x)-1 for x in df_seq[ut_c.COL_SEQ]] + tmd_stop = [len(x)-1 for x in df_seq[COL_SEQ]] if jmd_c_len is not None: tmd_stop = [x - jmd_c_len for x in tmd_stop] - df_seq[ut_c.COL_TMD_START] = tmd_start - df_seq[ut_c.COL_TMD_STOP] = tmd_stop - seq_info_in_df = set(ut_c.COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq)) + df_seq[COL_TMD_START] = tmd_start + df_seq[COL_TMD_STOP] = tmd_stop + seq_info_in_df = set(COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq)) # Check parameter combinations if [jmd_n_len, jmd_c_len].count(None) == 1: raise ValueError("'jmd_n_len' and 'jmd_c_len' should both be given (not None) or None") if not parts_in_df and seq_info_in_df and jmd_n_len is None and jmd_c_len is None: error = f"'jmd_n_len' and 'jmd_c_len' should not be None if " \ - f"sequence information ({ut_c.COLS_SEQ_TMD_POS_KEY}) are given." + f"sequence information ({COLS_SEQ_TMD_POS_KEY}) are given." raise ValueError(error) if not seq_info_in_df and jmd_n_len is not None and jmd_c_len is not None: - error = f"If not all sequence information ({ut_c.COLS_SEQ_TMD_POS_KEY}) are given," \ + error = f"If not all sequence information ({COLS_SEQ_TMD_POS_KEY}) are given," \ f"'jmd_n_len' and 'jmd_c_len' should be None." raise ValueError(error) if not parts_in_df and seq_info_in_df and (jmd_c_len is None or jmd_n_len is None): error = "If part columns ({}) are not in 'df_seq' but sequence information ({}), " \ - "\n'jmd_n_len' and 'jmd_c_len' should be given (not None).".format(ut_c.COLS_PARTS, ut_c.COLS_SEQ_TMD_POS_KEY) + "\n'jmd_n_len' and 'jmd_c_len' should be given (not None).".format(COLS_PARTS, COLS_SEQ_TMD_POS_KEY) raise ValueError(error) return df_seq @@ -145,17 +240,17 @@ def check_df_cat(df_cat=None, df_scales=None, accept_none=True, verbose=True): if not isinstance(df_cat, pd.DataFrame): raise ValueError("'df_cat' should be type pd.DataFrame (not {})".format(type(df_cat))) # Check columns - for col in [ut_c.COL_SCALE_ID, ut_c.COL_CAT, ut_c.COL_SUBCAT]: + for col in [COL_SCALE_ID, COL_CAT, COL_SUBCAT]: if col not in df_cat: raise ValueError(f"'{col}' not in 'df_cat'") # Check scales from df_cat and df_scales do match if df_scales is not None: - scales_cat = list(df_cat[ut_c.COL_SCALE_ID]) + scales_cat = list(df_cat[COL_SCALE_ID]) scales = list(df_scales) overlap_scales = [x for x in scales if x in scales_cat] difference_scales = list(set(scales).difference(set(scales_cat))) # Adjust df_cat and df_scales - df_cat = df_cat[df_cat[ut_c.COL_SCALE_ID].isin(overlap_scales)] + df_cat = df_cat[df_cat[COL_SCALE_ID].isin(overlap_scales)] df_scales = df_scales[overlap_scales] if verbose and len(difference_scales) > 0: str_warning = f"Scales from 'df_scales' and 'df_cat' do not overlap completely." @@ -171,7 +266,7 @@ def check_df_cat(df_cat=None, df_scales=None, accept_none=True, verbose=True): def check_df_scales(df_scales=None, df_parts=None, accept_none=False, accept_gaps=False): """Check if df_scales is a valid input and matching to df_parts""" - ut_check.check_bool(name="accept_gaps", val=accept_gaps) + check_bool(name="accept_gaps", val=accept_gaps) if accept_none and df_scales is None: return # Skip check if not isinstance(df_scales, pd.DataFrame): @@ -223,16 +318,16 @@ def check_df_feat(df_feat=None, df_cat=None): if len(df_feat) == 0 or len(list(df_feat)) == 0: raise ValueError("'df_feat' should be not empty") # Check if feature column in df_feat - if ut_c.COL_FEATURE not in df_feat: - raise ValueError(f"'{ut_c.COL_FEATURE}' must be column in 'df_feat'") - list_feat = list(df_feat[ut_c.COL_FEATURE]) + if COL_FEATURE not in df_feat: + raise ValueError(f"'{COL_FEATURE}' must be column in 'df_feat'") + list_feat = list(df_feat[COL_FEATURE]) for feat in list_feat: if feat.count("-") != 2: raise ValueError(f"'{feat}' is no valid feature") # Check if df_feat matches df_cat if df_cat is not None: scales = set([x.split("-")[2] for x in list_feat]) - list_scales = list(df_cat[ut_c.COL_SCALE_ID]) + list_scales = list(df_cat[COL_SCALE_ID]) missing_scales = [x for x in scales if x not in list_scales] if len(missing_scales) > 0: raise ValueError(f"Following scales occur in 'df_feat' but not in 'df_cat': {missing_scales}") diff --git a/tests/_utils.py b/tests/_utils.py index 205b41f5..7dd20404 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -9,7 +9,7 @@ # Helper Function def _folder_path(super_folder, folder_name): - """Modification of separator (OS depending)""" + """Modification of separator (OS-depending)""" path = os.path.join(super_folder, folder_name + SEP) return path diff --git a/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz b/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz index 86a33b00..7df482fa 100644 Binary files a/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz and b/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz differ diff --git a/tests/unit/test_cpp_feature.py b/tests/unit/test_cpp_feature.py index 9bb77b8e..53be9b61 100644 --- a/tests/unit/test_cpp_feature.py +++ b/tests/unit/test_cpp_feature.py @@ -11,55 +11,6 @@ # I Unit Tests -class TestLoadScales: - """Unit test for loading scales""" - - # Positive unit test - def test_load_data(self): - """Unit test for aa.SequenceFeature().load_scales() method""" - sf = aa.SequenceFeature() - assert isinstance(sf.load_scales(clust_th=0.5), pd.DataFrame) - - # Negative test - def test_wrong_clustered_values(self): - sf = aa.SequenceFeature() - for i in [0.1, -0.2, "a", None]: - with pytest.raises(ValueError): - sf.load_scales(clust_th=i) - - # Property-based testing - @given(clustered=some.floats(min_value=-10, max_value=10)) - def test_clustered_integer(self, clustered): - sf = aa.SequenceFeature() - if clustered not in [0.5, 0.7]: - with pytest.raises(ValueError): - sf.load_scales(clust_th=clustered) - - -class TestLoadCategories: - """Unit test for loading DataFrame with sequence categories""" - - # Positive unit test - def test_load_categories(self): - sf = aa.SequenceFeature() - assert isinstance(aa.load_scales(clust_th=0.5), pd.DataFrame) - - # Negative test - def test_wrong_clustered_values(self): - sf = aa.SequenceFeature() - for i in [0.1, -0.2, "a", None]: - with pytest.raises(ValueError): - aa.load_scales(clust_th=i) - - # Property-based testing - @given(clustered=some.floats(min_value=-10, max_value=10)) - def test_clustered_integer(self, clustered): - sf = aa.SequenceFeature() - if clustered not in [0.5, 0.7]: - with pytest.raises(ValueError): - aa.load_scales(clust_th=clustered) - - class TestGetDfParts: """Unit test for loading DataFrame with sequence parts""" @@ -407,10 +358,10 @@ def test_sequence_feature(list_splits): """Positive regression/functional test of all aa.SequenceFeature() methods""" sf = aa.SequenceFeature() # Get test set of sequences - df_seq = sf.load_sequences() + df_seq = aa.load_dataset() # Get feature components df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=False) - df_scales = sf.load_scales() + df_scales = aa.load_scales() split_kws = sf.get_split_kws() # Get features (names, values, matrix) features = sf.get_features()[0:100] diff --git a/tutorials/prelude_on_plotting.ipynb b/tutorials/prelude_on_plotting.ipynb new file mode 100644 index 00000000..29cd7437 --- /dev/null +++ b/tutorials/prelude_on_plotting.ipynb @@ -0,0 +1,35 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/tutorial1_quick_start.ipynb b/tutorials/tutorial1_quick_start.ipynb old mode 100755 new mode 100644 index 008c3d8d..13033a25 --- a/tutorials/tutorial1_quick_start.ipynb +++ b/tutorials/tutorial1_quick_start.ipynb @@ -1,22 +1,55 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Quick Start with AAanalysis\n", + "Dive into the powerful capabilities of ``AAanalysis``—a Python framework dedicated to sequence-based, alignment-free protein prediction. In this tutorial, using gamma-secretase substrates and non-substrates as an example, we'll focus on extracting interpretable features from protein sequences using the ``AAclust`` and ``CPP`` models and how they can be harnessed for binary classification tasks.\n", + "\n", + "What You Will Learn:\n", + "- ``Loading Sequences and Scales``: How to easily load protein sequences and their amino acid scales.\n", + "- ``Feature Engineering``: Extract essential features using the ``AAclust`` and ``CPP`` models.\n", + "- ``Protein Prediction``: Make predictions using the RandomForest model.\n", + "- ``Explainable AI``: Interpret predictions at the group and individual levels by combining ``CPP`` with ``SHAP``.\n", + "\n", + "## 1. Loading Sequences and Scales\n", + "With AAanalysis, you have access to numerous benchmark datasets for protein sequence analysis. Using our γ-secretase substrates and non-substrates dataset as a hands-on example, you can effortlessly retrieve these datasets using the ``aa.load_dataset()`` function. Furthermore, amino acid scales, predominantly from AAindex, along with their hierarchical classification (known as ``AAontology``), are available at your fingertips with the ``aa.load_scales()`` function." + ], + "metadata": { + "collapsed": false + } + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "pycharm": { - "name": "#%%\n", - "is_executing": true + "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2023-09-23T14:15:04.562034649Z", + "start_time": "2023-09-23T14:15:04.508201346Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": " entry sequence label tmd_start tmd_stop jmd_n tmd jmd_c\n0 Q14802 MQKVTLGLLVFLAGFPVLDANDLEDKNSPFYYDWHSLQVGGLICAG... 0 37 59 NSPFYYDWHS LQVGGLICAGVLCAMGIIIVMSA KCKCKFGQKS\n1 Q86UE4 MAARSWQDELAQQAEEGSARLREMLSVGLGFLRTELGLDLGLEPKR... 0 50 72 LGLEPKRYPG WVILVGTGALGLLLLFLLGYGWA AACAGARKKR\n2 Q969W9 MHRLMGVNSTAAAAAGQPNVSCTCNCKRSLFQSMEITELEFVQIII... 0 41 63 FQSMEITELE FVQIIIIVVVMMVMVVVITCLLS HYKLSARSFI\n3 P53801 MAPGVARGPTPYWRLRLGGAALLLLLIPVAAAQEPPGAACSQNTNK... 0 97 119 RWGVCWVNFE ALIITMSVVGGTLLLGIAICCCC CCRRKRSRKP\n4 Q8IUW5 MAPRALPGSAVLAAAVFVGGAVSSPLVAPDNGSSRTLHSRTETTPS... 0 59 81 NDTGNGHPEY IAYALVPVFFIMGLFGVLICHLL KKKGYRCTTE\n.. ... ... ... ... ... ... ... ...\n95 P15209 MSPWLKWHGPAMARLWGLCLLVLGFWRASLACPTSCKCSSARIWCT... 1 431 453 VADQSNREHL SVYAVVVIASVVGFCLLVMLLLL KLARHSKFGM\n96 Q86YL7 MWKVSALLFVLGSASLWVLAEGASTGQPEDDTETTGLEGGVAMPGA... 1 130 152 TVEKDGLSTV TLVGIIVGVLLAIGFIGAIIVVV MRKMSGRYSP\n97 Q13308 MGAARGSPARPRRLPLLSVLLLPLLGGTQTAIVFIKQPSSQDALQG... 1 704 726 GSPPPYKMIQ TIGLSVGAAVAYIIAVLGLMFYC KKRCKAKRLQ\n98 P10586 MAPEPAPGRTMVPLVPALVMLGLVAGAHGDSKPVFIKVPEDQTGLS... 1 1262 1284 PAQQQEEPEM LWVTGPVLAVILIILIVIAILLF KRKRTHSPSS\n99 P28828 MRTLGTCLVTLAGLLLTAAGETFSGGCLFDEPYSTCGYSQADEDDF... 1 743 764 PEKQTDHTVK IAGVIAGILLFVIIFLGVVLVM KKRKLAKKRK\n\n[100 rows x 8 columns]", + "text/html": "
\n | entry | \nsequence | \nlabel | \ntmd_start | \ntmd_stop | \njmd_n | \ntmd | \njmd_c | \n
---|---|---|---|---|---|---|---|---|
0 | \nQ14802 | \nMQKVTLGLLVFLAGFPVLDANDLEDKNSPFYYDWHSLQVGGLICAG... | \n0 | \n37 | \n59 | \nNSPFYYDWHS | \nLQVGGLICAGVLCAMGIIIVMSA | \nKCKCKFGQKS | \n
1 | \nQ86UE4 | \nMAARSWQDELAQQAEEGSARLREMLSVGLGFLRTELGLDLGLEPKR... | \n0 | \n50 | \n72 | \nLGLEPKRYPG | \nWVILVGTGALGLLLLFLLGYGWA | \nAACAGARKKR | \n
2 | \nQ969W9 | \nMHRLMGVNSTAAAAAGQPNVSCTCNCKRSLFQSMEITELEFVQIII... | \n0 | \n41 | \n63 | \nFQSMEITELE | \nFVQIIIIVVVMMVMVVVITCLLS | \nHYKLSARSFI | \n
3 | \nP53801 | \nMAPGVARGPTPYWRLRLGGAALLLLLIPVAAAQEPPGAACSQNTNK... | \n0 | \n97 | \n119 | \nRWGVCWVNFE | \nALIITMSVVGGTLLLGIAICCCC | \nCCRRKRSRKP | \n
4 | \nQ8IUW5 | \nMAPRALPGSAVLAAAVFVGGAVSSPLVAPDNGSSRTLHSRTETTPS... | \n0 | \n59 | \n81 | \nNDTGNGHPEY | \nIAYALVPVFFIMGLFGVLICHLL | \nKKKGYRCTTE | \n
... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n... | \n
95 | \nP15209 | \nMSPWLKWHGPAMARLWGLCLLVLGFWRASLACPTSCKCSSARIWCT... | \n1 | \n431 | \n453 | \nVADQSNREHL | \nSVYAVVVIASVVGFCLLVMLLLL | \nKLARHSKFGM | \n
96 | \nQ86YL7 | \nMWKVSALLFVLGSASLWVLAEGASTGQPEDDTETTGLEGGVAMPGA... | \n1 | \n130 | \n152 | \nTVEKDGLSTV | \nTLVGIIVGVLLAIGFIGAIIVVV | \nMRKMSGRYSP | \n
97 | \nQ13308 | \nMGAARGSPARPRRLPLLSVLLLPLLGGTQTAIVFIKQPSSQDALQG... | \n1 | \n704 | \n726 | \nGSPPPYKMIQ | \nTIGLSVGAAVAYIIAVLGLMFYC | \nKKRCKAKRLQ | \n
98 | \nP10586 | \nMAPEPAPGRTMVPLVPALVMLGLVAGAHGDSKPVFIKVPEDQTGLS... | \n1 | \n1262 | \n1284 | \nPAQQQEEPEM | \nLWVTGPVLAVILIILIVIAILLF | \nKRKRTHSPSS | \n
99 | \nP28828 | \nMRTLGTCLVTLAGLLLTAAGETFSGGCLFDEPYSTCGYSQADEDDF... | \n1 | \n743 | \n764 | \nPEKQTDHTVK | \nIAGVIAGILLFVIIFLGVVLVM | \nKKRKLAKKRK | \n
100 rows × 8 columns
\n\n | SUEM840101 | \nNISK860101 | \nKANM800101 | \nCHOP780101 | \nMIYS990105 | \nFAUJ880103 | \nQIAN880126 | \nMUNV940105 | \nLINS030104 | \nJOND920101 | \n
---|---|---|---|---|---|---|---|---|---|---|
AA | \n\n | \n | \n | \n | \n | \n | \n | \n | \n | \n |
A | \n0.788 | \n0.406 | \n0.875 | \n0.174 | \n0.492 | \n0.124 | \n0.451 | \n0.175 | \n0.093 | \n0.818 | \n
C | \n0.544 | \n0.906 | \n0.312 | \n0.661 | \n0.016 | \n0.301 | \n0.324 | \n0.089 | \n0.000 | \n0.078 | \n
D | \n0.146 | \n0.006 | \n0.542 | \n0.908 | \n0.825 | \n0.344 | \n0.745 | \n0.337 | \n0.588 | \n0.494 | \n
E | \n0.622 | \n0.055 | \n1.000 | \n0.248 | \n0.857 | \n0.468 | \n0.471 | \n0.182 | \n0.804 | \n0.623 | \n
F | \n0.813 | \n0.968 | \n0.552 | \n0.119 | \n0.000 | \n0.729 | \n0.186 | \n0.066 | \n0.082 | \n0.338 | \n
G | \n0.000 | \n0.262 | \n0.115 | \n1.000 | \n0.492 | \n0.000 | \n0.676 | \n0.393 | \n0.144 | \n0.779 | \n
H | \n0.425 | \n0.559 | \n0.615 | \n0.440 | \n0.492 | \n0.577 | \n0.696 | \n0.125 | \n0.423 | \n0.117 | \n
I | \n0.901 | \n1.000 | \n0.583 | \n0.000 | \n0.079 | \n0.495 | \n0.314 | \n0.050 | \n0.010 | \n0.506 | \n
K | \n0.571 | \n0.000 | \n0.729 | \n0.495 | \n1.000 | \n0.590 | \n0.088 | \n0.155 | \n1.000 | \n0.584 | \n
L | \n0.901 | \n0.942 | \n0.719 | \n0.110 | \n0.016 | \n0.495 | \n0.059 | \n0.152 | \n0.041 | \n1.000 | \n
M | \n1.000 | \n0.788 | \n0.969 | \n0.119 | \n0.127 | \n0.548 | \n0.000 | \n0.083 | \n0.082 | \n0.130 | \n
N | \n0.317 | \n0.157 | \n0.385 | \n1.000 | \n0.683 | \n0.365 | \n0.608 | \n0.244 | \n0.557 | \n0.377 | \n
P | \n0.112 | \n0.118 | \n0.000 | \n0.963 | \n0.698 | \n0.337 | \n0.873 | \n1.000 | \n0.454 | \n0.481 | \n
Q | \n0.634 | \n0.145 | \n0.646 | \n0.468 | \n0.762 | \n0.489 | \n0.471 | \n0.152 | \n0.711 | \n0.351 | \n
R | \n0.726 | \n0.333 | \n0.500 | \n0.440 | \n0.651 | \n0.759 | \n0.529 | \n0.119 | \n0.845 | \n0.481 | \n
S | \n0.278 | \n0.137 | \n0.229 | \n0.881 | \n0.698 | \n0.198 | \n0.931 | \n0.109 | \n0.309 | \n0.714 | \n
T | \n0.371 | \n0.305 | \n0.302 | \n0.450 | \n0.603 | \n0.322 | \n1.000 | \n0.040 | \n0.330 | \n0.584 | \n
V | \n0.589 | \n0.884 | \n0.438 | \n0.028 | \n0.159 | \n0.371 | \n0.510 | \n0.000 | \n0.031 | \n0.675 | \n
W | \n0.847 | \n0.961 | \n0.469 | \n0.450 | \n0.095 | \n1.000 | \n0.196 | \n0.076 | \n0.206 | \n0.000 | \n
Y | \n0.704 | \n0.649 | \n0.281 | \n0.615 | \n0.159 | \n0.801 | \n0.853 | \n0.036 | \n0.268 | \n0.234 | \n
\n | tmd | \njmd_n_tmd_n | \ntmd_c_jmd_c | \n
---|---|---|---|
Q14802 | \nLQVGGLICAGVLCAMGIIIVMSA | \nNSPFYYDWHSLQVGGLICAGVL | \nCAMGIIIVMSAKCKCKFGQKS | \n
Q86UE4 | \nWVILVGTGALGLLLLFLLGYGWA | \nLGLEPKRYPGWVILVGTGALGL | \nLLLFLLGYGWAAACAGARKKR | \n
Q969W9 | \nFVQIIIIVVVMMVMVVVITCLLS | \nFQSMEITELEFVQIIIIVVVMM | \nVMVVVITCLLSHYKLSARSFI | \n
P53801 | \nALIITMSVVGGTLLLGIAICCCC | \nRWGVCWVNFEALIITMSVVGGT | \nLLLGIAICCCCCCRRKRSRKP | \n
Q8IUW5 | \nIAYALVPVFFIMGLFGVLICHLL | \nNDTGNGHPEYIAYALVPVFFIM | \nGLFGVLICHLLKKKGYRCTTE | \n
... | \n... | \n... | \n... | \n
P15209 | \nSVYAVVVIASVVGFCLLVMLLLL | \nVADQSNREHLSVYAVVVIASVV | \nGFCLLVMLLLLKLARHSKFGM | \n
Q86YL7 | \nTLVGIIVGVLLAIGFIGAIIVVV | \nTVEKDGLSTVTLVGIIVGVLLA | \nIGFIGAIIVVVMRKMSGRYSP | \n
Q13308 | \nTIGLSVGAAVAYIIAVLGLMFYC | \nGSPPPYKMIQTIGLSVGAAVAY | \nIIAVLGLMFYCKKRCKAKRLQ | \n
P10586 | \nLWVTGPVLAVILIILIVIAILLF | \nPAQQQEEPEMLWVTGPVLAVIL | \nIILIVIAILLFKRKRTHSPSS | \n
P28828 | \nIAGVIAGILLFVIIFLGVVLVM | \nPEKQTDHTVKIAGVIAGILLF | \nVIIFLGVVLVMKKRKLAKKRK | \n
100 rows × 3 columns
\n