diff --git a/aaanalysis/__pycache__/utils.cpython-39.pyc b/aaanalysis/__pycache__/utils.cpython-39.pyc index 7bd85d19..ce7d9e8b 100644 Binary files a/aaanalysis/__pycache__/utils.cpython-39.pyc and b/aaanalysis/__pycache__/utils.cpython-39.pyc differ diff --git a/aaanalysis/_utils/__init__.py b/aaanalysis/_utils/__init__.py index e69de29b..8b137891 100644 --- a/aaanalysis/_utils/__init__.py +++ b/aaanalysis/_utils/__init__.py @@ -0,0 +1 @@ + diff --git a/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc b/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc index ec36d995..a61deb51 100644 Binary files a/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc index 07d15af7..1c0a77f9 100644 Binary files a/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc differ diff --git a/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc deleted file mode 100644 index 7bd6f4e7..00000000 Binary files a/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc and /dev/null differ diff --git a/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc index 98d80c94..f93d2f5d 100644 Binary files a/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc differ diff --git a/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc b/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc index 59668b2c..e38ecd09 100644 Binary files a/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc differ diff --git 
a/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc b/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc index 6b292095..b7919e8f 100644 Binary files a/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc differ diff --git a/aaanalysis/_utils/_utils_constants.py b/aaanalysis/_utils/_utils_constants.py deleted file mode 100644 index 30b103e1..00000000 --- a/aaanalysis/_utils/_utils_constants.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -This is a script containing constant names, column names, or colors. - -AAanalysis comprises these primary pd.DataFrames: df_seq, df_part, df_cat, df_scales, df_feat - -""" - -# Default scale datasets for protein analysis -STR_SCALES = "scales" # Min-max normalized scales (from AAontology) -STR_SCALES_RAW = "scales_raw" # Raw scales (from AAontology) -STR_SCALES_PC = "scales_pc" # AAclust pc-based scales (pc: principal component) -STR_SCALE_CAT = "scales_cat" # AAontology -STR_TOP60 = "top60" # AAclustTop60 -STR_TOP60_EVAL = "top60_eval" # AAclustTop60 evaluation -NAMES_SCALE_SETS = [STR_SCALES, STR_SCALES_RAW, STR_SCALE_CAT, - STR_SCALES_PC, STR_TOP60, STR_TOP60_EVAL] - - -# Column names for primary df -# df_seq -COL_ENTRY = "entry" # ACC, protein entry, uniprot id -COL_NAME = "name" # Entry name, Protein name, Uniprot Name -COL_LABEL = "label" -COL_SEQ = "sequence" -COLS_PARTS = ["jmd_n", "tmd", "jmd_c"] -COL_TMD_START = "tmd_start" -COL_TMD_STOP = "tmd_stop" -COLS_SEQ_KEY = [COL_ENTRY, COL_SEQ, COL_LABEL] -COLS_SEQ_TMD_POS_KEY = [COL_SEQ, COL_TMD_START, COL_TMD_STOP] # TODO adjust to COL_ENTRY -COLS_SEQ_TMD_PART_KEY = [COL_ENTRY, COL_SEQ] + COLS_PARTS -# df_part - -# df_scales -# Column for df_cat (as defined in AAontology, retrieved by aa.load_scales(name="scale_cat")) -COL_SCALE_ID = "scale_id" -COL_CAT = "category" -COL_SUBCAT = "subcategory" -COL_SCALE_NAME = "scale_name" -COL_SCALE_DES = "scale_description" - - -# Columns for df_feat -COL_FEATURE = "feature" -# 
COL_CAT, COL_SUBCAT, COL_SCALE_NAME, COL_SCALE_DES -COL_ABS_AUC = "abs_auc" -COL_ABS_MEAN_DIF = "abs_mean_dif" -COL_MEAN_DIF = "mean_dif" -COL_STD_TEST = "std_test" -COL_STD_REF = "std_ref" -COL_PVAL_MW = "p_val_mann_whitney" -COL_PVAL_FDR = "p_val_fdr_bh" -COL_POSITION = "positions" - -# Columns for df_feat after processing with explainable AI methods -COL_FEAT_IMPORTANCE = "feat_importance" -COO_FEAT_IMP_STD = "feat_importance_std" -COL_FEAT_IMPACT = "feat_impact" - - -# Column name datasets (DOM_GSEC) - - - -# Column names cpp features - - -# Standard colors -COLOR_SHAP_POS = '#FF0D57' # (255, 13, 87) -COLOR_SHAP_NEG = '#1E88E5' # (30, 136, 229) -COLOR_FEAT_POS = '#9D2B39' # (157, 43, 57) Mean difference -COLOR_FEAT_NEG = '#326599' # (50, 101, 133) Mean difference -COLOR_FEAT_IMP = '#7F7F7F' # (127, 127, 127) feature importance -COLOR_TMD = '#00FA9A' # (0, 250, 154) -COLOR_JMD = '#0000FF' # (0, 0, 255) - -DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS, - "SHAP_NEG": COLOR_SHAP_NEG, - "FEAT_POS": COLOR_FEAT_POS, - "FEAT_NEG": COLOR_FEAT_NEG, - "FEAT_IMP": COLOR_FEAT_IMP, - "TMD": COLOR_TMD, - "JMD": COLOR_JMD} - -DICT_COLOR_CAT = {"ASA/Volume": "tab:blue", - "Composition": "tab:orange", - "Conformation": "tab:green", - "Energy": "tab:red", - "Others": "tab:gray", - "Polarity": "gold", - "Shape": "tab:cyan", - "Structure-Activity": "tab:brown"} \ No newline at end of file diff --git a/aaanalysis/_utils/utils_aaclust.py b/aaanalysis/_utils/utils_aaclust.py index 01038ca6..26c45173 100644 --- a/aaanalysis/_utils/utils_aaclust.py +++ b/aaanalysis/_utils/utils_aaclust.py @@ -7,7 +7,6 @@ METRIC_CORRELATION = "correlation" LIST_METRICS = [METRIC_CORRELATION, "manhattan", "euclidean", "cosine"] - # Check functions def check_model(model=None, model_kwargs=None, except_None=True): """""" diff --git a/aaanalysis/_utils/utils_cpp.py b/aaanalysis/_utils/utils_cpp.py index d9ca36ae..e701eb11 100644 --- a/aaanalysis/_utils/utils_cpp.py +++ b/aaanalysis/_utils/utils_cpp.py @@ -2,13 
+2,10 @@ This is a script with utility functions and settings for CPP project. """ import numpy as np -import pandas as pd import matplotlib.colors as mcolors import matplotlib.pyplot as plt -import aaanalysis._utils._utils_constants as ut_c import aaanalysis._utils._utils_check as ut_check -import aaanalysis._utils._utils_output as ut_o # Settings @@ -104,7 +101,7 @@ def _check_seq(seq, len_, name_seq, name_len): raise ValueError(f"The length of {seq} ({len(seq)}) should be >= {name_len} ({len_}).") return len(seq) - +# TODO split in two (separation of concerns) def check_args_len(tmd_len=None, jmd_n_len=None, jmd_c_len=None, ext_len=None, tmd_seq=None, jmd_n_seq=None, jmd_c_seq=None, accept_tmd_none=False): """Check length parameters and if they are matching with sequences if provided""" @@ -116,6 +113,7 @@ def check_args_len(tmd_len=None, jmd_n_len=None, jmd_c_len=None, ext_len=None, ut_check.check_non_negative_number(name="ext_len", val=ext_len, accept_none=True) # Check if lengths and sequences match tmd_len = _check_seq(tmd_seq, tmd_len, "tmd_seq", "tmd_len") + print(jmd_n_seq) jmd_n_len = _check_seq(jmd_n_seq, jmd_n_len, "jmd_n_seq", "jmd_n_len") jmd_c_len = _check_seq(jmd_c_seq, jmd_c_len, "jmd_c_seq", "jmd_c_len") # Check if lengths are matching @@ -269,7 +267,7 @@ def check_split(split=None): # Scale functions def get_dict_all_scales(df_scales=None): - """Get nested dictionary where each scales is a key for a amino acid scale value dictionary""" + """Get nested dictionary where each scale is a key for an amino acid scale value dictionary""" dict_all_scales = {col: dict(zip(df_scales.index.to_list(), df_scales[col])) for col in list(df_scales)} return dict_all_scales diff --git a/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc b/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc index 192664dc..27cd1216 100644 Binary files a/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc and b/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc differ diff --git 
a/aaanalysis/cpp/feature.py b/aaanalysis/cpp/feature.py index 1dc1e13c..9f6df6a0 100644 --- a/aaanalysis/cpp/feature.py +++ b/aaanalysis/cpp/feature.py @@ -164,7 +164,7 @@ def _feature_matrix(feat_names, dict_all_scales, df_parts, accept_gaps): accept_gaps=accept_gaps) return feat_matrix - + # II Main Functions class SequenceFeature: """Retrieve and create sequence feature components (Part, Split, and Scale). diff --git a/aaanalysis/utils.py b/aaanalysis/utils.py index 0a3f6736..d499a5ef 100644 --- a/aaanalysis/utils.py +++ b/aaanalysis/utils.py @@ -4,16 +4,22 @@ import os import platform from functools import lru_cache +import pandas as pd +import numpy as np -# Import utility functions for specific purposes -from aaanalysis._utils._utils_constants import * -from aaanalysis._utils._utils_check import * -from aaanalysis._utils._utils_output import * - -# Import utility function for specific modules -from aaanalysis._utils.utils_aaclust import * -from aaanalysis._utils.utils_cpp import * - +# Import utility functions explicitly +from aaanalysis._utils._utils_check import (check_non_negative_number, check_float, check_str, check_bool, + check_dict, check_tuple, + check_feat_matrix, check_col_in_df) +from aaanalysis._utils._utils_output import (print_red, print_start_progress, print_progress, print_finished_progress) +from aaanalysis._utils.utils_aaclust import (check_model, check_min_th, check_merge_metric, + METRIC_CORRELATION, LIST_METRICS) +from aaanalysis._utils.utils_cpp import (check_color, check_y_categorical, check_labels, check_ylim, + check_args_len, check_list_parts, check_split_kws, check_split, + get_dict_all_scales, get_vf_scale, + STR_SEGMENT, STR_PATTERN, STR_PERIODIC_PATTERN, STR_AA_GAP, + LIST_PARTS, LIST_ALL_PARTS, SPLIT_DESCRIPTION) +#from aaanalysis.utils.utils_dpulearn import () # I Folder structure def _folder_path(super_folder, folder_name): @@ -28,7 +34,95 @@ def _folder_path(super_folder, folder_name): URL_DATA = 
"https://github.com/breimanntools/aaanalysis/tree/master/aaanalysis/data/" +# Constants +# Default scale datasets for protein analysis +STR_SCALES = "scales" # Min-max normalized scales (from AAontology) +STR_SCALES_RAW = "scales_raw" # Raw scales (from AAontology) +STR_SCALES_PC = "scales_pc" # AAclust pc-based scales (pc: principal component) +STR_SCALE_CAT = "scales_cat" # AAontology +STR_TOP60 = "top60" # AAclustTop60 +STR_TOP60_EVAL = "top60_eval" # AAclustTop60 evaluation +NAMES_SCALE_SETS = [STR_SCALES, STR_SCALES_RAW, STR_SCALE_CAT, + STR_SCALES_PC, STR_TOP60, STR_TOP60_EVAL] + + +# Column names for primary df +# df_seq +COL_ENTRY = "entry" # ACC, protein entry, uniprot id +COL_NAME = "name" # Entry name, Protein name, Uniprot Name +COL_LABEL = "label" +COL_SEQ = "sequence" +COLS_PARTS = ["jmd_n", "tmd", "jmd_c"] +COL_TMD_START = "tmd_start" +COL_TMD_STOP = "tmd_stop" +COLS_SEQ_KEY = [COL_ENTRY, COL_SEQ, COL_LABEL] +COLS_SEQ_TMD_POS_KEY = [COL_SEQ, COL_TMD_START, COL_TMD_STOP] # TODO adjust to COL_ENTRY +COLS_SEQ_TMD_PART_KEY = [COL_ENTRY, COL_SEQ] + COLS_PARTS +# df_part + +# df_scales +# Column for df_cat (as defined in AAontology, retrieved by aa.load_scales(name="scale_cat")) +COL_SCALE_ID = "scale_id" +COL_CAT = "category" +COL_SUBCAT = "subcategory" +COL_SCALE_NAME = "scale_name" +COL_SCALE_DES = "scale_description" + + +# Columns for df_feat +COL_FEATURE = "feature" +# COL_CAT, COL_SUBCAT, COL_SCALE_NAME, COL_SCALE_DES +COL_ABS_AUC = "abs_auc" +COL_ABS_MEAN_DIF = "abs_mean_dif" +COL_MEAN_DIF = "mean_dif" +COL_STD_TEST = "std_test" +COL_STD_REF = "std_ref" +COL_PVAL_MW = "p_val_mann_whitney" +COL_PVAL_FDR = "p_val_fdr_bh" +COL_POSITION = "positions" + +# Columns for df_feat after processing with explainable AI methods +COL_FEAT_IMPORTANCE = "feat_importance" +COO_FEAT_IMP_STD = "feat_importance_std" +COL_FEAT_IMPACT = "feat_impact" + + +# Column name datasets (DOM_GSEC) + + + +# Column names cpp features + + +# Standard colors +COLOR_SHAP_POS = 
'#FF0D57' # (255, 13, 87) +COLOR_SHAP_NEG = '#1E88E5' # (30, 136, 229) +COLOR_FEAT_POS = '#9D2B39' # (157, 43, 57) Mean difference +COLOR_FEAT_NEG = '#326599' # (50, 101, 133) Mean difference +COLOR_FEAT_IMP = '#7F7F7F' # (127, 127, 127) feature importance +COLOR_TMD = '#00FA9A' # (0, 250, 154) +COLOR_JMD = '#0000FF' # (0, 0, 255) + +DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS, + "SHAP_NEG": COLOR_SHAP_NEG, + "FEAT_POS": COLOR_FEAT_POS, + "FEAT_NEG": COLOR_FEAT_NEG, + "FEAT_IMP": COLOR_FEAT_IMP, + "TMD": COLOR_TMD, + "JMD": COLOR_JMD} + +DICT_COLOR_CAT = {"ASA/Volume": "tab:blue", + "Composition": "tab:orange", + "Conformation": "tab:green", + "Energy": "tab:red", + "Others": "tab:gray", + "Polarity": "gold", + "Shape": "tab:cyan", + "Structure-Activity": "tab:brown"} + + # II MAIN FUNCTIONS +# Main Helper functions # Caching for data loading for better performance (data loaded ones) @lru_cache(maxsize=None) def read_excel_cached(name, index_col=None): @@ -43,32 +137,33 @@ def read_csv_cached(name, sep=None): return df.copy() +# Main check functions # Check key dataframes using constants and general checking functions (df_seq, df_parts, df_cat, df_scales, df_feat) def check_df_seq(df_seq=None, jmd_n_len=None, jmd_c_len=None): """Get features from df""" # TODO check if df_seq is None or not isinstance(df_seq, pd.DataFrame): raise ValueError("Type of 'df_seq' ({}) must be pd.DataFrame".format(type(df_seq))) - if ut_c.COL_ENTRY not in list(df_seq): - raise ValueError("'{}' must be in 'df_seq'".format(ut_c.COL_ENTRY)) - seq_info_in_df = set(ut_c.COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq)) - parts_in_df = set(ut_c.COLS_PARTS).issubset(set(df_seq)) - seq_in_df = ut_c.COL_SEQ in set(df_seq) + if COL_ENTRY not in list(df_seq): + raise ValueError("'{}' must be in 'df_seq'".format(COL_ENTRY)) + seq_info_in_df = set(COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq)) + parts_in_df = set(COLS_PARTS).issubset(set(df_seq)) + seq_in_df = COL_SEQ in set(df_seq) if "start" in list(df_seq): - 
raise ValueError(f"'df_seq' should not contain 'start' in columns. Change column to '{ut_c.COL_TMD_START}'.") + raise ValueError(f"'df_seq' should not contain 'start' in columns. Change column to '{COL_TMD_START}'.") if "stop" in list(df_seq): - raise ValueError(f"'df_seq' should not contain 'stop' in columns. Change column to '{ut_c.COL_TMD_STOP}'.") + raise ValueError(f"'df_seq' should not contain 'stop' in columns. Change column to '{COL_TMD_STOP}'.") if not (seq_info_in_df or parts_in_df or seq_in_df): - raise ValueError(f"'df_seq' should contain ['{ut_c.COL_SEQ}'], {ut_c.COLS_SEQ_TMD_POS_KEY}, or {ut_c.COLS_PARTS}") + raise ValueError(f"'df_seq' should contain ['{COL_SEQ}'], {COLS_SEQ_TMD_POS_KEY}, or {COLS_PARTS}") # Check data type in part or sequence columns else: if seq_info_in_df or seq_in_df: - error = f"Sequence column ('{ut_c.COL_SEQ}') should only contain strings" - dict_wrong_seq = {ut_c.COL_SEQ: [x for x in df_seq[ut_c.COL_SEQ].values if type(x) != str]} + error = f"Sequence column ('{COL_SEQ}') should only contain strings" + dict_wrong_seq = {COL_SEQ: [x for x in df_seq[COL_SEQ].values if type(x) != str]} else: - cols = ut_c.COLS_PARTS + cols = COLS_PARTS error = f"Part columns ('{cols}') should only contain strings" - dict_wrong_seq = {part: [x for x in df_seq[part].values if type(x) != str] for part in ut_c.COLS_PARTS} + dict_wrong_seq = {part: [x for x in df_seq[part].values if type(x) != str] for part in COLS_PARTS} # Filter empty lists dict_wrong_seq = {part: dict_wrong_seq[part] for part in dict_wrong_seq if len(dict_wrong_seq[part]) > 0} n_wrong_entries = sum([len(dict_wrong_seq[part]) for part in dict_wrong_seq]) @@ -78,33 +173,33 @@ def check_df_seq(df_seq=None, jmd_n_len=None, jmd_c_len=None): # Check if only sequence given -> Convert sequence to tmd if seq_in_df and not parts_in_df: if seq_info_in_df: - for entry, start, stop in zip(df_seq[ut_c.COL_ENTRY], df_seq[ut_c.COL_TMD_START], df_seq[ut_c.COL_TMD_STOP]): - 
ut_check.check_non_negative_number(name=f"tmd_start [{entry}]", val=start) - ut_check.check_non_negative_number(name=f"tmd_start [{entry}]", val=stop,) - tmd_start = [int(x) for x in df_seq[ut_c.COL_TMD_START]] - tmd_stop = [int(x) for x in df_seq[ut_c.COL_TMD_STOP]] + for entry, start, stop in zip(df_seq[COL_ENTRY], df_seq[COL_TMD_START], df_seq[COL_TMD_STOP]): + check_non_negative_number(name=f"tmd_start [{entry}]", val=start) + check_non_negative_number(name=f"tmd_start [{entry}]", val=stop,) + tmd_start = [int(x) for x in df_seq[COL_TMD_START]] + tmd_stop = [int(x) for x in df_seq[COL_TMD_STOP]] else: tmd_start = 1 if jmd_n_len is None else 1 + jmd_n_len - tmd_stop = [len(x)-1 for x in df_seq[ut_c.COL_SEQ]] + tmd_stop = [len(x)-1 for x in df_seq[COL_SEQ]] if jmd_c_len is not None: tmd_stop = [x - jmd_c_len for x in tmd_stop] - df_seq[ut_c.COL_TMD_START] = tmd_start - df_seq[ut_c.COL_TMD_STOP] = tmd_stop - seq_info_in_df = set(ut_c.COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq)) + df_seq[COL_TMD_START] = tmd_start + df_seq[COL_TMD_STOP] = tmd_stop + seq_info_in_df = set(COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq)) # Check parameter combinations if [jmd_n_len, jmd_c_len].count(None) == 1: raise ValueError("'jmd_n_len' and 'jmd_c_len' should both be given (not None) or None") if not parts_in_df and seq_info_in_df and jmd_n_len is None and jmd_c_len is None: error = f"'jmd_n_len' and 'jmd_c_len' should not be None if " \ - f"sequence information ({ut_c.COLS_SEQ_TMD_POS_KEY}) are given." + f"sequence information ({COLS_SEQ_TMD_POS_KEY}) are given." raise ValueError(error) if not seq_info_in_df and jmd_n_len is not None and jmd_c_len is not None: - error = f"If not all sequence information ({ut_c.COLS_SEQ_TMD_POS_KEY}) are given," \ + error = f"If not all sequence information ({COLS_SEQ_TMD_POS_KEY}) are given," \ f"'jmd_n_len' and 'jmd_c_len' should be None." 
raise ValueError(error) if not parts_in_df and seq_info_in_df and (jmd_c_len is None or jmd_n_len is None): error = "If part columns ({}) are not in 'df_seq' but sequence information ({}), " \ - "\n'jmd_n_len' and 'jmd_c_len' should be given (not None).".format(ut_c.COLS_PARTS, ut_c.COLS_SEQ_TMD_POS_KEY) + "\n'jmd_n_len' and 'jmd_c_len' should be given (not None).".format(COLS_PARTS, COLS_SEQ_TMD_POS_KEY) raise ValueError(error) return df_seq @@ -145,17 +240,17 @@ def check_df_cat(df_cat=None, df_scales=None, accept_none=True, verbose=True): if not isinstance(df_cat, pd.DataFrame): raise ValueError("'df_cat' should be type pd.DataFrame (not {})".format(type(df_cat))) # Check columns - for col in [ut_c.COL_SCALE_ID, ut_c.COL_CAT, ut_c.COL_SUBCAT]: + for col in [COL_SCALE_ID, COL_CAT, COL_SUBCAT]: if col not in df_cat: raise ValueError(f"'{col}' not in 'df_cat'") # Check scales from df_cat and df_scales do match if df_scales is not None: - scales_cat = list(df_cat[ut_c.COL_SCALE_ID]) + scales_cat = list(df_cat[COL_SCALE_ID]) scales = list(df_scales) overlap_scales = [x for x in scales if x in scales_cat] difference_scales = list(set(scales).difference(set(scales_cat))) # Adjust df_cat and df_scales - df_cat = df_cat[df_cat[ut_c.COL_SCALE_ID].isin(overlap_scales)] + df_cat = df_cat[df_cat[COL_SCALE_ID].isin(overlap_scales)] df_scales = df_scales[overlap_scales] if verbose and len(difference_scales) > 0: str_warning = f"Scales from 'df_scales' and 'df_cat' do not overlap completely." 
@@ -171,7 +266,7 @@ def check_df_cat(df_cat=None, df_scales=None, accept_none=True, verbose=True): def check_df_scales(df_scales=None, df_parts=None, accept_none=False, accept_gaps=False): """Check if df_scales is a valid input and matching to df_parts""" - ut_check.check_bool(name="accept_gaps", val=accept_gaps) + check_bool(name="accept_gaps", val=accept_gaps) if accept_none and df_scales is None: return # Skip check if not isinstance(df_scales, pd.DataFrame): @@ -223,16 +318,16 @@ def check_df_feat(df_feat=None, df_cat=None): if len(df_feat) == 0 or len(list(df_feat)) == 0: raise ValueError("'df_feat' should be not empty") # Check if feature column in df_feat - if ut_c.COL_FEATURE not in df_feat: - raise ValueError(f"'{ut_c.COL_FEATURE}' must be column in 'df_feat'") - list_feat = list(df_feat[ut_c.COL_FEATURE]) + if COL_FEATURE not in df_feat: + raise ValueError(f"'{COL_FEATURE}' must be column in 'df_feat'") + list_feat = list(df_feat[COL_FEATURE]) for feat in list_feat: if feat.count("-") != 2: raise ValueError(f"'{feat}' is no valid feature") # Check if df_feat matches df_cat if df_cat is not None: scales = set([x.split("-")[2] for x in list_feat]) - list_scales = list(df_cat[ut_c.COL_SCALE_ID]) + list_scales = list(df_cat[COL_SCALE_ID]) missing_scales = [x for x in scales if x not in list_scales] if len(missing_scales) > 0: raise ValueError(f"Following scales occur in 'df_feat' but not in 'df_cat': {missing_scales}") diff --git a/tests/_utils.py b/tests/_utils.py index 205b41f5..7dd20404 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -9,7 +9,7 @@ # Helper Function def _folder_path(super_folder, folder_name): - """Modification of separator (OS depending)""" + """Modification of separator (OS-depending)""" path = os.path.join(super_folder, folder_name + SEP) return path diff --git a/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz b/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz index 86a33b00..7df482fa 100644 Binary files 
a/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz and b/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz differ diff --git a/tests/unit/test_cpp_feature.py b/tests/unit/test_cpp_feature.py index 9bb77b8e..53be9b61 100644 --- a/tests/unit/test_cpp_feature.py +++ b/tests/unit/test_cpp_feature.py @@ -11,55 +11,6 @@ # I Unit Tests -class TestLoadScales: - """Unit test for loading scales""" - - # Positive unit test - def test_load_data(self): - """Unit test for aa.SequenceFeature().load_scales() method""" - sf = aa.SequenceFeature() - assert isinstance(sf.load_scales(clust_th=0.5), pd.DataFrame) - - # Negative test - def test_wrong_clustered_values(self): - sf = aa.SequenceFeature() - for i in [0.1, -0.2, "a", None]: - with pytest.raises(ValueError): - sf.load_scales(clust_th=i) - - # Property-based testing - @given(clustered=some.floats(min_value=-10, max_value=10)) - def test_clustered_integer(self, clustered): - sf = aa.SequenceFeature() - if clustered not in [0.5, 0.7]: - with pytest.raises(ValueError): - sf.load_scales(clust_th=clustered) - - -class TestLoadCategories: - """Unit test for loading DataFrame with sequence categories""" - - # Positive unit test - def test_load_categories(self): - sf = aa.SequenceFeature() - assert isinstance(aa.load_scales(clust_th=0.5), pd.DataFrame) - - # Negative test - def test_wrong_clustered_values(self): - sf = aa.SequenceFeature() - for i in [0.1, -0.2, "a", None]: - with pytest.raises(ValueError): - aa.load_scales(clust_th=i) - - # Property-based testing - @given(clustered=some.floats(min_value=-10, max_value=10)) - def test_clustered_integer(self, clustered): - sf = aa.SequenceFeature() - if clustered not in [0.5, 0.7]: - with pytest.raises(ValueError): - aa.load_scales(clust_th=clustered) - - class TestGetDfParts: """Unit test for loading DataFrame with sequence parts""" @@ -407,10 +358,10 @@ def test_sequence_feature(list_splits): """Positive regression/functional test of all 
aa.SequenceFeature() methods""" sf = aa.SequenceFeature() # Get test set of sequences - df_seq = sf.load_sequences() + df_seq = aa.load_dataset() # Get feature components df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=False) - df_scales = sf.load_scales() + df_scales = aa.load_scales() split_kws = sf.get_split_kws() # Get features (names, values, matrix) features = sf.get_features()[0:100] diff --git a/tutorials/prelude_on_plotting.ipynb b/tutorials/prelude_on_plotting.ipynb new file mode 100644 index 00000000..29cd7437 --- /dev/null +++ b/tutorials/prelude_on_plotting.ipynb @@ -0,0 +1,35 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/tutorial1_quick_start.ipynb b/tutorials/tutorial1_quick_start.ipynb old mode 100755 new mode 100644 index 008c3d8d..13033a25 --- a/tutorials/tutorial1_quick_start.ipynb +++ b/tutorials/tutorial1_quick_start.ipynb @@ -1,22 +1,55 @@ { "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Quick Start with AAanalysis\n", + "Dive into the powerful capabilities of ``AAanalysis``—a Python framework dedicated to sequence-based, alignment-free protein prediction. 
In this tutorial, using gamma-secretase substrates and non-substrates as an example, we'll focus on extracting interpretable features from protein sequences using the ``AAclust`` and ``CPP`` models and how they can be harnessed for binary classification tasks.\n", + "\n", + "What You Will Learn:\n", + "- ``Loading Sequences and Scales``: How to easily load protein sequences and their amino acid scales.\n", + "- ``Feature Engineering``: Extract essential features using the ``AAclust`` and ``CPP`` models.\n", + "- ``Protein Prediction``: Make predictions using the RandomForest model.\n", + "- ``Explainable AI``: Interpret predictions at the group and individual levels by combining ``CPP`` with ``SHAP``.\n", + "\n", + "## 1. Loading Sequences and Scales\n", + "With AAanalysis, you have access to numerous benchmark datasets for protein sequence analysis. Using our γ-secretase substrates and non-substrates dataset as a hands-on example, you can effortlessly retrieve these datasets using the ``aa.load_dataset()`` function. Furthermore, amino acid scales, predominantly from AAindex, along with their hierarchical classification (known as ``AAontology``), are available at your fingertips with the ``aa.load_scales()`` function." + ], + "metadata": { + "collapsed": false + } + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "pycharm": { - "name": "#%%\n", - "is_executing": true + "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2023-09-23T14:15:04.562034649Z", + "start_time": "2023-09-23T14:15:04.508201346Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": " entry sequence label tmd_start tmd_stop jmd_n tmd jmd_c\n0 Q14802 MQKVTLGLLVFLAGFPVLDANDLEDKNSPFYYDWHSLQVGGLICAG... 0 37 59 NSPFYYDWHS LQVGGLICAGVLCAMGIIIVMSA KCKCKFGQKS\n1 Q86UE4 MAARSWQDELAQQAEEGSARLREMLSVGLGFLRTELGLDLGLEPKR... 0 50 72 LGLEPKRYPG WVILVGTGALGLLLLFLLGYGWA AACAGARKKR\n2 Q969W9 MHRLMGVNSTAAAAAGQPNVSCTCNCKRSLFQSMEITELEFVQIII... 
0 41 63 FQSMEITELE FVQIIIIVVVMMVMVVVITCLLS HYKLSARSFI\n3 P53801 MAPGVARGPTPYWRLRLGGAALLLLLIPVAAAQEPPGAACSQNTNK... 0 97 119 RWGVCWVNFE ALIITMSVVGGTLLLGIAICCCC CCRRKRSRKP\n4 Q8IUW5 MAPRALPGSAVLAAAVFVGGAVSSPLVAPDNGSSRTLHSRTETTPS... 0 59 81 NDTGNGHPEY IAYALVPVFFIMGLFGVLICHLL KKKGYRCTTE\n.. ... ... ... ... ... ... ... ...\n95 P15209 MSPWLKWHGPAMARLWGLCLLVLGFWRASLACPTSCKCSSARIWCT... 1 431 453 VADQSNREHL SVYAVVVIASVVGFCLLVMLLLL KLARHSKFGM\n96 Q86YL7 MWKVSALLFVLGSASLWVLAEGASTGQPEDDTETTGLEGGVAMPGA... 1 130 152 TVEKDGLSTV TLVGIIVGVLLAIGFIGAIIVVV MRKMSGRYSP\n97 Q13308 MGAARGSPARPRRLPLLSVLLLPLLGGTQTAIVFIKQPSSQDALQG... 1 704 726 GSPPPYKMIQ TIGLSVGAAVAYIIAVLGLMFYC KKRCKAKRLQ\n98 P10586 MAPEPAPGRTMVPLVPALVMLGLVAGAHGDSKPVFIKVPEDQTGLS... 1 1262 1284 PAQQQEEPEM LWVTGPVLAVILIILIVIAILLF KRKRTHSPSS\n99 P28828 MRTLGTCLVTLAGLLLTAAGETFSGGCLFDEPYSTCGYSQADEDDF... 1 743 764 PEKQTDHTVK IAGVIAGILLFVIIFLGVVLVM KKRKLAKKRK\n\n[100 rows x 8 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
entrysequencelabeltmd_starttmd_stopjmd_ntmdjmd_c
0Q14802MQKVTLGLLVFLAGFPVLDANDLEDKNSPFYYDWHSLQVGGLICAG...03759NSPFYYDWHSLQVGGLICAGVLCAMGIIIVMSAKCKCKFGQKS
1Q86UE4MAARSWQDELAQQAEEGSARLREMLSVGLGFLRTELGLDLGLEPKR...05072LGLEPKRYPGWVILVGTGALGLLLLFLLGYGWAAACAGARKKR
2Q969W9MHRLMGVNSTAAAAAGQPNVSCTCNCKRSLFQSMEITELEFVQIII...04163FQSMEITELEFVQIIIIVVVMMVMVVVITCLLSHYKLSARSFI
3P53801MAPGVARGPTPYWRLRLGGAALLLLLIPVAAAQEPPGAACSQNTNK...097119RWGVCWVNFEALIITMSVVGGTLLLGIAICCCCCCRRKRSRKP
4Q8IUW5MAPRALPGSAVLAAAVFVGGAVSSPLVAPDNGSSRTLHSRTETTPS...05981NDTGNGHPEYIAYALVPVFFIMGLFGVLICHLLKKKGYRCTTE
...........................
95P15209MSPWLKWHGPAMARLWGLCLLVLGFWRASLACPTSCKCSSARIWCT...1431453VADQSNREHLSVYAVVVIASVVGFCLLVMLLLLKLARHSKFGM
96Q86YL7MWKVSALLFVLGSASLWVLAEGASTGQPEDDTETTGLEGGVAMPGA...1130152TVEKDGLSTVTLVGIIVGVLLAIGFIGAIIVVVMRKMSGRYSP
97Q13308MGAARGSPARPRRLPLLSVLLLPLLGGTQTAIVFIKQPSSQDALQG...1704726GSPPPYKMIQTIGLSVGAAVAYIIAVLGLMFYCKKRCKAKRLQ
98P10586MAPEPAPGRTMVPLVPALVMLGLVAGAHGDSKPVFIKVPEDQTGLS...112621284PAQQQEEPEMLWVTGPVLAVILIILIVIAILLFKRKRTHSPSS
99P28828MRTLGTCLVTLAGLLLTAAGETFSGGCLFDEPYSTCGYSQADEDDF...1743764PEKQTDHTVKIAGVIAGILLFVIIFLGVVLVMKKRKLAKKRK
\n

100 rows × 8 columns

\n
" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "import numpy as np\n", - "from sklearn.cluster import AgglomerativeClustering\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.model_selection import cross_val_score\n", - "\n", - "import aaanalysis as aa" + "import aaanalysis as aa\n", + "# Load scales and scale categories (AAontology) \n", + "df_scales = aa.load_scales()\n", + "df_cat = aa.load_scales(name=\"scales_cat\")\n", + "# Load training data\n", + "df_seq = aa.load_dataset(name=\"DOM_GSEC\", n=50)\n", + "df_seq" ] }, { @@ -27,100 +60,144 @@ } }, "source": [ - "# Loading scales and sequence data using AAanalysis\n", - "AAanalysis is a python framework for sequence-based and alignment-free protein prediction. It comprises several example datasets for sequence and residue predictions tasks, which can be retrieved using the aa.load_dataset() function. Amino acid scales (most from AAindex) and their hierarchical classification (named AAontology) can be accessed using the aa.load_scales() function. Since redundancy is an essential problem for machine learning tasks, the AAclust() object provides a lightweight wrapper for sklearn clustering algorithms such as Agglomerative clustering. AAclust clusters a set of scales and selects for each cluster the most representative scale (i.e., the scale closes to the cluster center)." + "## 2. Feature Engineering\n", + "The centerpiece of AAanalysis is the Comparative Physicochemical Profiling (``CPP``) model, which is supported by ``AAclust`` for the pre-selection of amino acid scales. \n", + "\n", + "### AAclust\n", + "Since redundancy is an essential problem for machine learning tasks, the ``AAclust`` object provides a lightweight wrapper for sklearn clustering algorithms such as Agglomerative clustering. 
AAclust clusters a set of scales and selects for each cluster the most representative scale (i.e., the scale closest to the cluster center)." ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "execution_count": 9, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " entry sequence label\n", - "1 DISULFIDE_2 PLHHLXIGTWTPPGAIFTVQFDDEKLTCKLIKRTEIPQDEPISWXT... 0\n", - "4 DISULFIDE_5 GFPELKNDTFLRAAWGEETDYTPVWCMRQAGRYLPEFRETRAAQDF... 0\n", - "5 DISULFIDE_6 MKIIRIETSRIAVPLTKPFKTALRTVYTAESVIVRITYDSGAVGWG... 0\n", - "7 DISULFIDE_8 AAERVFISPAKYVQGKNVITKIANYLEGIGNKTVVIADEIVWKIAG... 0\n", - "9 DISULFIDE_10 MKFTVEREHLLKPLQQVSGPLGGRPTLPILGNLLLQVADGTLSLTG... 0\n", - "... ... ... ...\n", - "2130 DISULFIDE_2131 DSLDEQRSRYAQIKQAWDNRQMDVVEQMMPGLKDYPLYPYLEYRQI... 1\n", - "2132 DISULFIDE_2133 SRTHVCQSDTHIFIIMGASGDLAKKKIYPTIWWLFRDGLLPENTFI... 1\n", - "2134 DISULFIDE_2135 ATTYNAVVSKSSSDGKTFKTIADAIASAPAGSTPFVILIKNGVYNE... 1\n", - "2137 DISULFIDE_2138 KQFSQEFRDGYSILKHYGGNGPYSERVSYGIARDPPTSCEVDQVIM... 1\n", - "2147 DISULFIDE_2148 MRNRREVSKLLSERVLLLDGAYGTEFMKYGYDDLPEELNIKAPDVV... 
1\n", - "\n", - "[200 rows x 3 columns]\n" - ] + "data": { + "text/plain": " SUEM840101 NISK860101 KANM800101 CHOP780101 MIYS990105 FAUJ880103 QIAN880126 MUNV940105 LINS030104 JOND920101\nAA \nA 0.788 0.406 0.875 0.174 0.492 0.124 0.451 0.175 0.093 0.818\nC 0.544 0.906 0.312 0.661 0.016 0.301 0.324 0.089 0.000 0.078\nD 0.146 0.006 0.542 0.908 0.825 0.344 0.745 0.337 0.588 0.494\nE 0.622 0.055 1.000 0.248 0.857 0.468 0.471 0.182 0.804 0.623\nF 0.813 0.968 0.552 0.119 0.000 0.729 0.186 0.066 0.082 0.338\nG 0.000 0.262 0.115 1.000 0.492 0.000 0.676 0.393 0.144 0.779\nH 0.425 0.559 0.615 0.440 0.492 0.577 0.696 0.125 0.423 0.117\nI 0.901 1.000 0.583 0.000 0.079 0.495 0.314 0.050 0.010 0.506\nK 0.571 0.000 0.729 0.495 1.000 0.590 0.088 0.155 1.000 0.584\nL 0.901 0.942 0.719 0.110 0.016 0.495 0.059 0.152 0.041 1.000\nM 1.000 0.788 0.969 0.119 0.127 0.548 0.000 0.083 0.082 0.130\nN 0.317 0.157 0.385 1.000 0.683 0.365 0.608 0.244 0.557 0.377\nP 0.112 0.118 0.000 0.963 0.698 0.337 0.873 1.000 0.454 0.481\nQ 0.634 0.145 0.646 0.468 0.762 0.489 0.471 0.152 0.711 0.351\nR 0.726 0.333 0.500 0.440 0.651 0.759 0.529 0.119 0.845 0.481\nS 0.278 0.137 0.229 0.881 0.698 0.198 0.931 0.109 0.309 0.714\nT 0.371 0.305 0.302 0.450 0.603 0.322 1.000 0.040 0.330 0.584\nV 0.589 0.884 0.438 0.028 0.159 0.371 0.510 0.000 0.031 0.675\nW 0.847 0.961 0.469 0.450 0.095 1.000 0.196 0.076 0.206 0.000\nY 0.704 0.649 0.281 0.615 0.159 0.801 0.853 0.036 0.268 0.234", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
SUEM840101NISK860101KANM800101CHOP780101MIYS990105FAUJ880103QIAN880126MUNV940105LINS030104JOND920101
AA
A0.7880.4060.8750.1740.4920.1240.4510.1750.0930.818
C0.5440.9060.3120.6610.0160.3010.3240.0890.0000.078
D0.1460.0060.5420.9080.8250.3440.7450.3370.5880.494
E0.6220.0551.0000.2480.8570.4680.4710.1820.8040.623
F0.8130.9680.5520.1190.0000.7290.1860.0660.0820.338
G0.0000.2620.1151.0000.4920.0000.6760.3930.1440.779
H0.4250.5590.6150.4400.4920.5770.6960.1250.4230.117
I0.9011.0000.5830.0000.0790.4950.3140.0500.0100.506
K0.5710.0000.7290.4951.0000.5900.0880.1551.0000.584
L0.9010.9420.7190.1100.0160.4950.0590.1520.0411.000
M1.0000.7880.9690.1190.1270.5480.0000.0830.0820.130
N0.3170.1570.3851.0000.6830.3650.6080.2440.5570.377
P0.1120.1180.0000.9630.6980.3370.8731.0000.4540.481
Q0.6340.1450.6460.4680.7620.4890.4710.1520.7110.351
R0.7260.3330.5000.4400.6510.7590.5290.1190.8450.481
S0.2780.1370.2290.8810.6980.1980.9310.1090.3090.714
T0.3710.3050.3020.4500.6030.3221.0000.0400.3300.584
V0.5890.8840.4380.0280.1590.3710.5100.0000.0310.675
W0.8470.9610.4690.4500.0951.0000.1960.0760.2060.000
Y0.7040.6490.2810.6150.1590.8010.8530.0360.2680.234
\n
" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# Load training data\n", - "df_info = aa.load_dataset()\n", - "df = aa.load_dataset(name=\"SEQ_DISULFIDE\", min_len=300, n=100)\n", - "print(df)\n", - "# Load scales and scale categories from AAanalysis\n", - "df_scales = aa.load_scales()\n", - "df_cat = aa.load_scales(name=\"scales_cat\")\n", - "# Select scales using AAclust\n", + "from sklearn.cluster import AgglomerativeClustering\n", + "import numpy as np\n", "aac = aa.AAclust(model=AgglomerativeClustering, model_kwargs=dict(linkage=\"ward\"))\n", "X = np.array(df_scales).T\n", - "scales = aac.fit(X, n_clusters=10, names=list(df_scales)) # Number of clusters = number of selected scales (100 is recommended)\n", - "df_cat = df_cat[df_cat[\"scale_id\"].isin(scales)]\n", - "df_scales = df_scales[scales]" - ] + "scales = aac.fit(X, n_clusters=10, names=list(df_scales)) \n", + "df_scales = df_scales[scales]\n", + "df_scales" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-23T14:15:07.346288614Z", + "start_time": "2023-09-23T14:15:07.280822767Z" + } + } }, { "cell_type": "markdown", + "source": [ + "### Comparative Physicochemical Profiling (CPP)\n", + " CPP is a sequence-based feature engineering algorithm. It aims at identifying a set of features most discriminant between two sets of sequences: the test set and the reference set. Supported by the ``SequenceFeature`` object (``sf``), a CPP feature integrates:\n", + " \n", + "- ``Parts``: Combinations of a target middle domain (TMD) and N- and C-terminal adjacent regions (JMD-N and JMD-C, respectively), obtained via ``sf.get_df_parts()``.\n", + "- ``Splits``: These `Parts` can be split into various continuous segments or discontinuous patterns, specified via ``sf.get_split_kws()``. \n", + "- ``Scales``: Sets of amino acid scales." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 10, "metadata": { "pycharm": { - "name": "#%% md\n" + "name": "#%%\n" + }, + "ExecuteTime": { + "end_time": "2023-09-23T14:15:11.021981566Z", + "start_time": "2023-09-23T14:15:10.913819589Z" } }, + "outputs": [ + { + "data": { + "text/plain": " tmd jmd_n_tmd_n tmd_c_jmd_c\nQ14802 LQVGGLICAGVLCAMGIIIVMSA NSPFYYDWHSLQVGGLICAGVL CAMGIIIVMSAKCKCKFGQKS\nQ86UE4 WVILVGTGALGLLLLFLLGYGWA LGLEPKRYPGWVILVGTGALGL LLLFLLGYGWAAACAGARKKR\nQ969W9 FVQIIIIVVVMMVMVVVITCLLS FQSMEITELEFVQIIIIVVVMM VMVVVITCLLSHYKLSARSFI\nP53801 ALIITMSVVGGTLLLGIAICCCC RWGVCWVNFEALIITMSVVGGT LLLGIAICCCCCCRRKRSRKP\nQ8IUW5 IAYALVPVFFIMGLFGVLICHLL NDTGNGHPEYIAYALVPVFFIM GLFGVLICHLLKKKGYRCTTE\n... ... ... ...\nP15209 SVYAVVVIASVVGFCLLVMLLLL VADQSNREHLSVYAVVVIASVV GFCLLVMLLLLKLARHSKFGM\nQ86YL7 TLVGIIVGVLLAIGFIGAIIVVV TVEKDGLSTVTLVGIIVGVLLA IGFIGAIIVVVMRKMSGRYSP\nQ13308 TIGLSVGAAVAYIIAVLGLMFYC GSPPPYKMIQTIGLSVGAAVAY IIAVLGLMFYCKKRCKAKRLQ\nP10586 LWVTGPVLAVILIILIVIAILLF PAQQQEEPEMLWVTGPVLAVIL IILIVIAILLFKRKRTHSPSS\nP28828 IAGVIAGILLFVIIFLGVVLVM PEKQTDHTVKIAGVIAGILLF VIIFLGVVLVMKKRKLAKKRK\n\n[100 rows x 3 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
tmdjmd_n_tmd_ntmd_c_jmd_c
Q14802LQVGGLICAGVLCAMGIIIVMSANSPFYYDWHSLQVGGLICAGVLCAMGIIIVMSAKCKCKFGQKS
Q86UE4WVILVGTGALGLLLLFLLGYGWALGLEPKRYPGWVILVGTGALGLLLLFLLGYGWAAACAGARKKR
Q969W9FVQIIIIVVVMMVMVVVITCLLSFQSMEITELEFVQIIIIVVVMMVMVVVITCLLSHYKLSARSFI
P53801ALIITMSVVGGTLLLGIAICCCCRWGVCWVNFEALIITMSVVGGTLLLGIAICCCCCCRRKRSRKP
Q8IUW5IAYALVPVFFIMGLFGVLICHLLNDTGNGHPEYIAYALVPVFFIMGLFGVLICHLLKKKGYRCTTE
............
P15209SVYAVVVIASVVGFCLLVMLLLLVADQSNREHLSVYAVVVIASVVGFCLLVMLLLLKLARHSKFGM
Q86YL7TLVGIIVGVLLAIGFIGAIIVVVTVEKDGLSTVTLVGIIVGVLLAIGFIGAIIVVVMRKMSGRYSP
Q13308TIGLSVGAAVAYIIAVLGLMFYCGSPPPYKMIQTIGLSVGAAVAYIIAVLGLMFYCKKRCKAKRLQ
P10586LWVTGPVLAVILIILIVIAILLFPAQQQEEPEMLWVTGPVLAVILIILIVIAILLFKRKRTHSPSS
P28828IAGVIAGILLFVIIFLGVVLVMPEKQTDHTVKIAGVIAGILLFVIIFLGVVLVMKKRKLAKKRK
\n

100 rows × 3 columns

\n
" + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Feature Engineering\n", - "*Comparative Physicochemical Profiling (CPP)* is a sequence-based feature engineering algorithm aiming at identifying a set of features that is most discriminant between two sets of sequences, called test set and reference set. A CPP feature is a combination of a *Part*, a *Split*, and a *Scale*. *Parts* are combination of a target middle domain (TMD) and N- and C-terminal adjacent regions (JMD-N and JMD-C, respectively). They can be obtained from a dataframe with sequences using the sf.get_df_parts() method from the SequenceFeature class, where the length of the JMDs can be specified. These *Parts* can be split into various continuous segments or discontinuous patterns, for which the sf.get_split_kws() method creates a parameter dictionary. The scales (and or scale categories), the parts, and the split parameters are used to instantiates the CPP class. Running the CPP algorithm creates all *Part*, *Split*, *Split* combinations and filters a selected maximum (via 'n_filter' argument) of non-redundant features." + "y = list(df_seq[\"label\"])\n", + "sf = aa.SequenceFeature()\n", + "df_parts = sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=10)\n", + "split_kws = sf.get_split_kws(n_split_max=1, split_types=[\"Segment\"])\n", + "df_parts" ] }, { - "cell_type": "code", - "execution_count": 3, + "cell_type": "markdown", + "source": [ + "Running the CPP algorithm creates all `Part`, `Split`, `Scale` combinations and filters a selected maximum of non-redundant features:" + ], "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 12, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1. CPP creates 30 features for 200 samples\n", - " |#########################| 100.00%\n", + "1. 
CPP creates 30 features for 100 samples\n", + " |#########################| 100.00%\u001B[0m91mm\n", "2. CPP pre-filters 1 features (5%) with highest 'abs_mean_dif' and 'max_std_test' <= 0.2\n", - "3. CPP filtering algorithm\n", - "4. CPP returns df with 1 unique features including general information and statistics\n" + "3. CPP filtering algorithm\n" + ] + }, + { + "ename": "ValueError", + "evalue": "'jmd_n_seq' should be string (type=)", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[12], line 3\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;66;03m# Small set of features (300 features created)\u001B[39;00m\n\u001B[1;32m 2\u001B[0m cpp \u001B[38;5;241m=\u001B[39m aa\u001B[38;5;241m.\u001B[39mCPP(df_parts\u001B[38;5;241m=\u001B[39mdf_parts, df_cat\u001B[38;5;241m=\u001B[39mdf_cat, df_scales\u001B[38;5;241m=\u001B[39mdf_scales, split_kws\u001B[38;5;241m=\u001B[39msplit_kws)\n\u001B[0;32m----> 3\u001B[0m df_feat \u001B[38;5;241m=\u001B[39m \u001B[43mcpp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrun\u001B[49m\u001B[43m(\u001B[49m\u001B[43mlabels\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43my\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtmd_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m20\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mn_filter\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m100\u001B[39;49m\u001B[43m)\u001B[49m \n", + "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/cpp/cpp.py:310\u001B[0m, in \u001B[0;36mCPP.run\u001B[0;34m(self, labels, parametric, n_filter, tmd_len, jmd_n_len, jmd_c_len, ext_len, start, check_cat, n_pre_filter, pct_pre_filter, max_std_test, max_overlap, max_cor, n_processes)\u001B[0m\n\u001B[1;32m 308\u001B[0m 
\u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_verbose:\n\u001B[1;32m 309\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m3. CPP filtering algorithm\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m--> 310\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_add_positions\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf_feat\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdf\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtmd_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtmd_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstart\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mstart\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 311\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_add_scale_info(df_feat\u001B[38;5;241m=\u001B[39mdf)\n\u001B[1;32m 312\u001B[0m df_feat \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_filtering(df\u001B[38;5;241m=\u001B[39mdf, n_filter\u001B[38;5;241m=\u001B[39mn_filter, check_cat\u001B[38;5;241m=\u001B[39mcheck_cat, max_overlap\u001B[38;5;241m=\u001B[39mmax_overlap, max_cor\u001B[38;5;241m=\u001B[39mmax_cor)\n", + "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/cpp/cpp.py:161\u001B[0m, in \u001B[0;36mCPP._add_positions\u001B[0;34m(df_feat, tmd_len, jmd_n_len, jmd_c_len, ext_len, start)\u001B[0m\n\u001B[1;32m 159\u001B[0m features \u001B[38;5;241m=\u001B[39m df_feat[ut\u001B[38;5;241m.\u001B[39mCOL_FEATURE]\u001B[38;5;241m.\u001B[39mto_list()\n\u001B[1;32m 160\u001B[0m sf \u001B[38;5;241m=\u001B[39m SequenceFeature()\n\u001B[0;32m--> 161\u001B[0m feat_positions \u001B[38;5;241m=\u001B[39m 
\u001B[43msf\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43madd_position\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfeatures\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mfeatures\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtmd_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtmd_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstart\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mstart\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 162\u001B[0m \u001B[43m \u001B[49m\u001B[43mjmd_n_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_n_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mjmd_c_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_c_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mext_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mext_len\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 163\u001B[0m df_feat[ut\u001B[38;5;241m.\u001B[39mCOL_POSITION] \u001B[38;5;241m=\u001B[39m feat_positions\n\u001B[1;32m 164\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m df_feat\n", + "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/cpp/feature.py:614\u001B[0m, in \u001B[0;36mSequenceFeature.add_position\u001B[0;34m(df_feat, features, start, tmd_len, jmd_n_len, jmd_c_len, ext_len, part_split)\u001B[0m\n\u001B[1;32m 612\u001B[0m ut\u001B[38;5;241m.\u001B[39mcheck_non_negative_number(name\u001B[38;5;241m=\u001B[39mname, val\u001B[38;5;241m=\u001B[39margs[name], just_int\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m, min_val\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m0\u001B[39m)\n\u001B[1;32m 613\u001B[0m sfp \u001B[38;5;241m=\u001B[39m SequenceFeaturePositions()\n\u001B[0;32m--> 614\u001B[0m dict_part_pos \u001B[38;5;241m=\u001B[39m 
\u001B[43msfp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_dict_part_pos\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtmd_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtmd_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 615\u001B[0m feat_positions \u001B[38;5;241m=\u001B[39m sfp\u001B[38;5;241m.\u001B[39mget_positions(dict_part_pos\u001B[38;5;241m=\u001B[39mdict_part_pos, features\u001B[38;5;241m=\u001B[39mfeatures)\n\u001B[1;32m 616\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m feat_positions\n", + "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/cpp/_feature_pos.py:52\u001B[0m, in \u001B[0;36mSequenceFeaturePositions.get_dict_part_pos\u001B[0;34m(tmd_len, jmd_n_len, jmd_c_len, ext_len, start)\u001B[0m\n\u001B[1;32m 50\u001B[0m tmd \u001B[38;5;241m=\u001B[39m [i \u001B[38;5;241m+\u001B[39m start \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m tmd]\n\u001B[1;32m 51\u001B[0m jmd_c \u001B[38;5;241m=\u001B[39m [i \u001B[38;5;241m+\u001B[39m start \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m jmd_c]\n\u001B[0;32m---> 52\u001B[0m dict_part_pos \u001B[38;5;241m=\u001B[39m \u001B[43mpa\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_dict_part_seq\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtmd_seq\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtmd\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mjmd_n_seq\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_n\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mjmd_c_seq\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_c\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mext_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mext_len\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 53\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m 
dict_part_pos\n", + "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/cpp/_part.py:190\u001B[0m, in \u001B[0;36mParts.get_dict_part_seq\u001B[0;34m(df, entry, tmd_seq, jmd_n_seq, jmd_c_seq, ext_len)\u001B[0m\n\u001B[1;32m 188\u001B[0m tmd_seq, jmd_n_seq, jmd_c_seq \u001B[38;5;241m=\u001B[39m _get_parts_from_df(df\u001B[38;5;241m=\u001B[39mdf, entry\u001B[38;5;241m=\u001B[39mentry)\n\u001B[1;32m 189\u001B[0m check_parts(tmd\u001B[38;5;241m=\u001B[39mtmd_seq, jmd_n\u001B[38;5;241m=\u001B[39mjmd_n_seq, jmd_c\u001B[38;5;241m=\u001B[39mjmd_c_seq)\n\u001B[0;32m--> 190\u001B[0m \u001B[43mut\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcheck_args_len\u001B[49m\u001B[43m(\u001B[49m\u001B[43mjmd_n_seq\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_n_seq\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mjmd_c_seq\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_c_seq\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mext_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mext_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43maccept_tmd_none\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m)\u001B[49m\n\u001B[1;32m 191\u001B[0m dict_part_seq \u001B[38;5;241m=\u001B[39m _get_dict_part_seq_from_seq(tmd\u001B[38;5;241m=\u001B[39mtmd_seq, jmd_n\u001B[38;5;241m=\u001B[39mjmd_n_seq, jmd_c\u001B[38;5;241m=\u001B[39mjmd_c_seq, ext_len\u001B[38;5;241m=\u001B[39mext_len)\n\u001B[1;32m 192\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m dict_part_seq\n", + "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/_utils/utils_cpp.py:119\u001B[0m, in \u001B[0;36mcheck_args_len\u001B[0;34m(tmd_len, jmd_n_len, jmd_c_len, ext_len, tmd_seq, jmd_n_seq, jmd_c_seq, accept_tmd_none)\u001B[0m\n\u001B[1;32m 117\u001B[0m \u001B[38;5;66;03m# Check if lengths and sequences match\u001B[39;00m\n\u001B[1;32m 118\u001B[0m tmd_len 
\u001B[38;5;241m=\u001B[39m _check_seq(tmd_seq, tmd_len, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtmd_seq\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtmd_len\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m--> 119\u001B[0m jmd_n_len \u001B[38;5;241m=\u001B[39m \u001B[43m_check_seq\u001B[49m\u001B[43m(\u001B[49m\u001B[43mjmd_n_seq\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mjmd_n_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mjmd_n_seq\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mjmd_n_len\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 120\u001B[0m jmd_c_len \u001B[38;5;241m=\u001B[39m _check_seq(jmd_c_seq, jmd_c_len, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mjmd_c_seq\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mjmd_c_len\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 121\u001B[0m \u001B[38;5;66;03m# Check if lengths are matching\u001B[39;00m\n", + "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/_utils/utils_cpp.py:100\u001B[0m, in \u001B[0;36m_check_seq\u001B[0;34m(seq, len_, name_seq, name_len)\u001B[0m\n\u001B[1;32m 98\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 99\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(seq, \u001B[38;5;28mstr\u001B[39m):\n\u001B[0;32m--> 100\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname_seq\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m should be string 
(type=\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mtype\u001B[39m(seq)\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m)\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 101\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m len_ \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m 102\u001B[0m \u001B[38;5;66;03m# Waring sequence length doesn't match the corresponding length parameter\u001B[39;00m\n\u001B[1;32m 103\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(seq) \u001B[38;5;241m<\u001B[39m len_:\n", + "\u001B[0;31mValueError\u001B[0m: 'jmd_n_seq' should be string (type=)" ] } ], "source": [ - "# Feature Engineering\n", - "y = list(df[\"label\"])\n", - "sf = aa.SequenceFeature()\n", - "df_parts = sf.get_df_parts(df_seq=df, jmd_n_len=50, jmd_c_len=50)\n", - "args = dict(df_scales=df_scales, df_parts=df_parts, accept_gaps=True)\n", "# Small set of features (300 features created)\n", - "split_kws = sf.get_split_kws(n_split_max=1, split_types=[\"Segment\"])\n", - "cpp = aa.CPP(df_cat=df_cat, **args, split_kws=split_kws)\n", - "df_feat = cpp.run(labels=y, tmd_len=200, n_processes=8, n_filter=100)" - ] + "cpp = aa.CPP(df_parts=df_parts, df_cat=df_cat, df_scales=df_scales, split_kws=split_kws)\n", + "df_feat = cpp.run(labels=y, tmd_len=20, n_filter=100) " + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-23T14:22:06.231449367Z", + "start_time": "2023-09-23T14:21:58.054329311Z" + } + } }, { "cell_type": "markdown", @@ -130,43 +207,74 @@ } }, "source": [ - "# Machine learning\n", - "The SequenceFeature class provides as well a method to create a feature matrix from a given set of CPP features." + "## 3. 
Protein Prediction\n", + "A feature matrix from a given set of CPP features can be created using ``sf.feat_matrix``:" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "execution_count": 13, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Mean accuracy of 0.55\n" + "ename": "NameError", + "evalue": "name 'df_feat' is not defined", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[13], line 3\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01msklearn\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mensemble\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m RandomForestClassifier\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01msklearn\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmodel_selection\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m cross_val_score\n\u001B[0;32m----> 3\u001B[0m X \u001B[38;5;241m=\u001B[39m sf\u001B[38;5;241m.\u001B[39mfeat_matrix(df_parts\u001B[38;5;241m=\u001B[39mdf_parts, df_scales\u001B[38;5;241m=\u001B[39mdf_scales, features\u001B[38;5;241m=\u001B[39m\u001B[43mdf_feat\u001B[49m[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mfeature\u001B[39m\u001B[38;5;124m\"\u001B[39m])\n", + "\u001B[0;31mNameError\u001B[0m: name 'df_feat' is not defined" ] } ], "source": [ - "X = sf.feat_matrix(**args, features=df_feat[\"feature\"])\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import cross_val_score\n", + "X = sf.feat_matrix(df_parts=df_parts, df_scales=df_scales, features=df_feat[\"feature\"])" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-23T14:22:13.569330952Z", + "start_time": 
"2023-09-23T14:22:13.527712901Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "This feature matrix can now be used for common machine learning models." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ "# ML evaluation\n", "rf = RandomForestClassifier()\n", "cv = cross_val_score(rf, X, y, scoring=\"accuracy\", cv=5, n_jobs=8) # Set n_jobs=1 to disable multi-processing\n", "print(f\"Mean accuracy of {round(np.mean(cv), 2)}\")" - ] + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Creating more initial features will take some more time but improve prediction performance. " + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", "execution_count": 5, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, "outputs": [ { "name": "stdout", @@ -182,16 +290,32 @@ } ], "source": [ - "# Default set of features (around 100.000 features created)\n", + "# Default split settings for features (around 100.000 features created)\n", "split_kws = sf.get_split_kws()\n", - "cpp = aa.CPP(df_cat=df_cat, **args, split_kws=split_kws)\n", + "cpp = aa.CPP(df_cat=df_cat, df_parts=df_parts, df_scales=df_scales, split_kws=split_kws)\n", "df_feat = cpp.run(labels=y, tmd_len=200, n_processes=8, n_filter=100)\n", - "X = sf.feat_matrix(**args, features=df_feat[\"feature\"])\n", + "X = sf.feat_matrix(df_parts=df_parts, df_scales=df_scales, features=df_feat[\"feature\"])\n", "# ML evaluation\n", "rf = RandomForestClassifier()\n", "cv = cross_val_score(rf, X, y, scoring=\"accuracy\", cv=5, n_jobs=1) # Set n_jobs=1 to disable multi-processing\n", "print(f\"Mean accuracy of {round(np.mean(cv), 2)}\")" - ] + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## 4. 
Explainable AI\n", + "\n", + "### Explainable AI on group level\n", + "\n", + "### Explainable AI on individual level" + ], + "metadata": { + "collapsed": false + } } ], "metadata": { @@ -215,4 +339,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +}