diff --git a/aaanalysis/__pycache__/utils.cpython-39.pyc b/aaanalysis/__pycache__/utils.cpython-39.pyc
index 7bd85d19..ce7d9e8b 100644
Binary files a/aaanalysis/__pycache__/utils.cpython-39.pyc and b/aaanalysis/__pycache__/utils.cpython-39.pyc differ
diff --git a/aaanalysis/_utils/__init__.py b/aaanalysis/_utils/__init__.py
index e69de29b..8b137891 100644
--- a/aaanalysis/_utils/__init__.py
+++ b/aaanalysis/_utils/__init__.py
@@ -0,0 +1 @@
+
diff --git a/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc b/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc
index ec36d995..a61deb51 100644
Binary files a/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc
index 07d15af7..1c0a77f9 100644
Binary files a/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/_utils_check.cpython-39.pyc differ
diff --git a/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc
deleted file mode 100644
index 7bd6f4e7..00000000
Binary files a/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc and /dev/null differ
diff --git a/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc
index 98d80c94..f93d2f5d 100644
Binary files a/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/_utils_output.cpython-39.pyc differ
diff --git a/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc b/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc
index 59668b2c..e38ecd09 100644
Binary files a/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/utils_aaclust.cpython-39.pyc differ
diff --git a/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc b/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc
index 6b292095..b7919e8f 100644
Binary files a/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc and b/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc differ
diff --git a/aaanalysis/_utils/_utils_constants.py b/aaanalysis/_utils/_utils_constants.py
deleted file mode 100644
index 30b103e1..00000000
--- a/aaanalysis/_utils/_utils_constants.py
+++ /dev/null
@@ -1,91 +0,0 @@
-"""
-This is a script containing constant names, column names, or colors.
-
-AAanalysis comprises these primary pd.DataFrames: df_seq, df_part, df_cat, df_scales, df_feat
-
-"""
-
-# Default scale datasets for protein analysis
-STR_SCALES = "scales"   # Min-max normalized scales (from AAontology)
-STR_SCALES_RAW = "scales_raw"   # Raw scales (from AAontology)
-STR_SCALES_PC = "scales_pc"     # AAclust pc-based scales (pc: principal component)
-STR_SCALE_CAT = "scales_cat"  # AAontology
-STR_TOP60 = "top60"    # AAclustTop60
-STR_TOP60_EVAL = "top60_eval"  # AAclustTop60 evaluation
-NAMES_SCALE_SETS = [STR_SCALES, STR_SCALES_RAW, STR_SCALE_CAT,
-                    STR_SCALES_PC, STR_TOP60, STR_TOP60_EVAL]
-
-
-# Column names for primary df
-# df_seq
-COL_ENTRY = "entry"     # ACC, protein entry, uniprot id
-COL_NAME = "name"       # Entry name, Protein name, Uniprot Name
-COL_LABEL = "label"
-COL_SEQ = "sequence"
-COLS_PARTS = ["jmd_n", "tmd", "jmd_c"]
-COL_TMD_START = "tmd_start"
-COL_TMD_STOP = "tmd_stop"
-COLS_SEQ_KEY = [COL_ENTRY, COL_SEQ, COL_LABEL]
-COLS_SEQ_TMD_POS_KEY = [COL_SEQ, COL_TMD_START, COL_TMD_STOP]  # TODO adjust to COL_ENTRY
-COLS_SEQ_TMD_PART_KEY = [COL_ENTRY, COL_SEQ] + COLS_PARTS
-# df_part
-
-# df_scales
-# Column for df_cat (as defined in AAontology, retrieved by aa.load_scales(name="scale_cat"))
-COL_SCALE_ID = "scale_id"
-COL_CAT = "category"
-COL_SUBCAT = "subcategory"
-COL_SCALE_NAME = "scale_name"
-COL_SCALE_DES = "scale_description"
-
-
-# Columns for df_feat
-COL_FEATURE = "feature"
-# COL_CAT, COL_SUBCAT, COL_SCALE_NAME, COL_SCALE_DES
-COL_ABS_AUC = "abs_auc"
-COL_ABS_MEAN_DIF = "abs_mean_dif"
-COL_MEAN_DIF = "mean_dif"
-COL_STD_TEST = "std_test"
-COL_STD_REF = "std_ref"
-COL_PVAL_MW = "p_val_mann_whitney"
-COL_PVAL_FDR = "p_val_fdr_bh"
-COL_POSITION = "positions"
-
-# Columns for df_feat after processing with explainable AI methods
-COL_FEAT_IMPORTANCE = "feat_importance"
-COO_FEAT_IMP_STD = "feat_importance_std"
-COL_FEAT_IMPACT = "feat_impact"
-
-
-# Column name datasets (DOM_GSEC)
-
-
-
-# Column names cpp features
-
-
-# Standard colors
-COLOR_SHAP_POS = '#FF0D57'  # (255, 13, 87)
-COLOR_SHAP_NEG = '#1E88E5'  # (30, 136, 229)
-COLOR_FEAT_POS = '#9D2B39'  # (157, 43, 57) Mean difference
-COLOR_FEAT_NEG = '#326599'  # (50, 101, 133) Mean difference
-COLOR_FEAT_IMP = '#7F7F7F'  # (127, 127, 127) feature importance
-COLOR_TMD = '#00FA9A'       # (0, 250, 154)
-COLOR_JMD = '#0000FF'       # (0, 0, 255)
-
-DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS,
-              "SHAP_NEG": COLOR_SHAP_NEG,
-              "FEAT_POS": COLOR_FEAT_POS,
-              "FEAT_NEG": COLOR_FEAT_NEG,
-              "FEAT_IMP": COLOR_FEAT_IMP,
-              "TMD": COLOR_TMD,
-              "JMD": COLOR_JMD}
-
-DICT_COLOR_CAT = {"ASA/Volume": "tab:blue",
-                  "Composition": "tab:orange",
-                  "Conformation": "tab:green",
-                  "Energy": "tab:red",
-                  "Others": "tab:gray",
-                  "Polarity": "gold",
-                  "Shape": "tab:cyan",
-                  "Structure-Activity": "tab:brown"}
\ No newline at end of file
diff --git a/aaanalysis/_utils/utils_aaclust.py b/aaanalysis/_utils/utils_aaclust.py
index 01038ca6..26c45173 100644
--- a/aaanalysis/_utils/utils_aaclust.py
+++ b/aaanalysis/_utils/utils_aaclust.py
@@ -7,7 +7,6 @@
 METRIC_CORRELATION = "correlation"
 LIST_METRICS = [METRIC_CORRELATION, "manhattan",  "euclidean", "cosine"]
 
-
 # Check functions
 def check_model(model=None, model_kwargs=None, except_None=True):
     """"""
diff --git a/aaanalysis/_utils/utils_cpp.py b/aaanalysis/_utils/utils_cpp.py
index d9ca36ae..e701eb11 100644
--- a/aaanalysis/_utils/utils_cpp.py
+++ b/aaanalysis/_utils/utils_cpp.py
@@ -2,13 +2,10 @@
 This is a script with utility functions and settings for CPP project.
 """
 import numpy as np
-import pandas as pd
 import matplotlib.colors as mcolors
 import matplotlib.pyplot as plt
 
-import aaanalysis._utils._utils_constants as ut_c
 import aaanalysis._utils._utils_check as ut_check
-import aaanalysis._utils._utils_output as ut_o
 
 # Settings
 
@@ -104,7 +101,7 @@ def _check_seq(seq, len_, name_seq, name_len):
                 raise ValueError(f"The length of {seq} ({len(seq)}) should be >= {name_len} ({len_}).")
         return len(seq)
 
-
+# TODO split in two (separation of concerns)
 def check_args_len(tmd_len=None, jmd_n_len=None, jmd_c_len=None, ext_len=None,
                    tmd_seq=None, jmd_n_seq=None, jmd_c_seq=None, accept_tmd_none=False):
     """Check length parameters and if they are matching with sequences if provided"""
@@ -116,6 +113,7 @@ def check_args_len(tmd_len=None, jmd_n_len=None, jmd_c_len=None, ext_len=None,
     ut_check.check_non_negative_number(name="ext_len", val=ext_len, accept_none=True)
     # Check if lengths and sequences match
     tmd_len = _check_seq(tmd_seq, tmd_len, "tmd_seq", "tmd_len")
+    print(jmd_n_seq)
     jmd_n_len = _check_seq(jmd_n_seq, jmd_n_len, "jmd_n_seq", "jmd_n_len")
     jmd_c_len = _check_seq(jmd_c_seq, jmd_c_len, "jmd_c_seq", "jmd_c_len")
     # Check if lengths are matching
@@ -269,7 +267,7 @@ def check_split(split=None):
 
 # Scale functions
 def get_dict_all_scales(df_scales=None):
-    """Get nested dictionary where each scales is a key for a amino acid scale value dictionary"""
+    """Get nested dictionary where each scale is a key for an amino acid scale value dictionary"""
     dict_all_scales = {col: dict(zip(df_scales.index.to_list(), df_scales[col])) for col in list(df_scales)}
     return dict_all_scales
 
diff --git a/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc b/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc
index 192664dc..27cd1216 100644
Binary files a/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc and b/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc differ
diff --git a/aaanalysis/cpp/feature.py b/aaanalysis/cpp/feature.py
index 1dc1e13c..9f6df6a0 100644
--- a/aaanalysis/cpp/feature.py
+++ b/aaanalysis/cpp/feature.py
@@ -164,7 +164,7 @@ def _feature_matrix(feat_names, dict_all_scales, df_parts, accept_gaps):
                                            accept_gaps=accept_gaps)
     return feat_matrix
     
-    
+
 # II Main Functions
 class SequenceFeature:
     """Retrieve and create sequence feature components (Part, Split, and Scale).
diff --git a/aaanalysis/utils.py b/aaanalysis/utils.py
index 0a3f6736..d499a5ef 100644
--- a/aaanalysis/utils.py
+++ b/aaanalysis/utils.py
@@ -4,16 +4,22 @@
 import os
 import platform
 from functools import lru_cache
+import pandas as pd
+import numpy as np
 
-# Import utility functions for specific purposes
-from aaanalysis._utils._utils_constants import *
-from aaanalysis._utils._utils_check import *
-from aaanalysis._utils._utils_output import *
-
-# Import utility function for specific modules
-from aaanalysis._utils.utils_aaclust import *
-from aaanalysis._utils.utils_cpp import *
-
+# Import utility functions explicitly
+from aaanalysis._utils._utils_check import (check_non_negative_number, check_float, check_str, check_bool,
+                                            check_dict, check_tuple,
+                                            check_feat_matrix, check_col_in_df)
+from aaanalysis._utils._utils_output import (print_red, print_start_progress, print_progress, print_finished_progress)
+from aaanalysis._utils.utils_aaclust import (check_model, check_min_th, check_merge_metric,
+                                             METRIC_CORRELATION, LIST_METRICS)
+from aaanalysis._utils.utils_cpp import (check_color, check_y_categorical, check_labels, check_ylim,
+                                         check_args_len, check_list_parts, check_split_kws, check_split,
+                                         get_dict_all_scales, get_vf_scale,
+                                         STR_SEGMENT, STR_PATTERN, STR_PERIODIC_PATTERN, STR_AA_GAP,
+                                         LIST_PARTS, LIST_ALL_PARTS, SPLIT_DESCRIPTION)
+# TODO: add explicit imports from utils_dpulearn (placeholder; no names are imported yet)
 
 # I Folder structure
 def _folder_path(super_folder, folder_name):
@@ -28,7 +34,95 @@ def _folder_path(super_folder, folder_name):
 URL_DATA = "https://github.com/breimanntools/aaanalysis/tree/master/aaanalysis/data/"
 
 
+# Constants
+# Default scale datasets for protein analysis
+STR_SCALES = "scales"   # Min-max normalized scales (from AAontology)
+STR_SCALES_RAW = "scales_raw"   # Raw scales (from AAontology)
+STR_SCALES_PC = "scales_pc"     # AAclust pc-based scales (pc: principal component)
+STR_SCALE_CAT = "scales_cat"  # AAontology
+STR_TOP60 = "top60"    # AAclustTop60
+STR_TOP60_EVAL = "top60_eval"  # AAclustTop60 evaluation
+NAMES_SCALE_SETS = [STR_SCALES, STR_SCALES_RAW, STR_SCALE_CAT,
+                    STR_SCALES_PC, STR_TOP60, STR_TOP60_EVAL]
+
+
+# Column names for primary df
+# df_seq
+COL_ENTRY = "entry"     # ACC, protein entry, uniprot id
+COL_NAME = "name"       # Entry name, Protein name, Uniprot Name
+COL_LABEL = "label"
+COL_SEQ = "sequence"
+COLS_PARTS = ["jmd_n", "tmd", "jmd_c"]
+COL_TMD_START = "tmd_start"
+COL_TMD_STOP = "tmd_stop"
+COLS_SEQ_KEY = [COL_ENTRY, COL_SEQ, COL_LABEL]
+COLS_SEQ_TMD_POS_KEY = [COL_SEQ, COL_TMD_START, COL_TMD_STOP]  # TODO adjust to COL_ENTRY
+COLS_SEQ_TMD_PART_KEY = [COL_ENTRY, COL_SEQ] + COLS_PARTS
+# df_part
+
+# df_scales
+# Columns for df_cat (as defined in AAontology, retrieved by aa.load_scales(name="scales_cat"))
+COL_SCALE_ID = "scale_id"
+COL_CAT = "category"
+COL_SUBCAT = "subcategory"
+COL_SCALE_NAME = "scale_name"
+COL_SCALE_DES = "scale_description"
+
+
+# Columns for df_feat
+COL_FEATURE = "feature"
+# COL_CAT, COL_SUBCAT, COL_SCALE_NAME, COL_SCALE_DES
+COL_ABS_AUC = "abs_auc"
+COL_ABS_MEAN_DIF = "abs_mean_dif"
+COL_MEAN_DIF = "mean_dif"
+COL_STD_TEST = "std_test"
+COL_STD_REF = "std_ref"
+COL_PVAL_MW = "p_val_mann_whitney"
+COL_PVAL_FDR = "p_val_fdr_bh"
+COL_POSITION = "positions"
+
+# Columns for df_feat after processing with explainable AI methods
+COL_FEAT_IMPORTANCE = "feat_importance"
+COO_FEAT_IMP_STD = "feat_importance_std"
+COL_FEAT_IMPACT = "feat_impact"
+
+
+# Column name datasets (DOM_GSEC)
+
+
+
+# Column names cpp features
+
+
+# Standard colors
+COLOR_SHAP_POS = '#FF0D57'  # (255, 13, 87)
+COLOR_SHAP_NEG = '#1E88E5'  # (30, 136, 229)
+COLOR_FEAT_POS = '#9D2B39'  # (157, 43, 57) Mean difference
+COLOR_FEAT_NEG = '#326599'  # (50, 101, 153) Mean difference
+COLOR_FEAT_IMP = '#7F7F7F'  # (127, 127, 127) feature importance
+COLOR_TMD = '#00FA9A'       # (0, 250, 154)
+COLOR_JMD = '#0000FF'       # (0, 0, 255)
+
+DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS,
+              "SHAP_NEG": COLOR_SHAP_NEG,
+              "FEAT_POS": COLOR_FEAT_POS,
+              "FEAT_NEG": COLOR_FEAT_NEG,
+              "FEAT_IMP": COLOR_FEAT_IMP,
+              "TMD": COLOR_TMD,
+              "JMD": COLOR_JMD}
+
+DICT_COLOR_CAT = {"ASA/Volume": "tab:blue",
+                  "Composition": "tab:orange",
+                  "Conformation": "tab:green",
+                  "Energy": "tab:red",
+                  "Others": "tab:gray",
+                  "Polarity": "gold",
+                  "Shape": "tab:cyan",
+                  "Structure-Activity": "tab:brown"}
+
+
 # II MAIN FUNCTIONS
+# Main Helper functions
 # Caching for data loading for better performance (data loaded ones)
 @lru_cache(maxsize=None)
 def read_excel_cached(name, index_col=None):
@@ -43,32 +137,33 @@ def read_csv_cached(name, sep=None):
     return df.copy()
 
 
+# Main check functions
 # Check key dataframes using constants and general checking functions (df_seq, df_parts, df_cat, df_scales, df_feat)
 def check_df_seq(df_seq=None, jmd_n_len=None, jmd_c_len=None):
     """Get features from df"""
     # TODO check
     if df_seq is None or not isinstance(df_seq, pd.DataFrame):
         raise ValueError("Type of 'df_seq' ({}) must be pd.DataFrame".format(type(df_seq)))
-    if ut_c.COL_ENTRY not in list(df_seq):
-        raise ValueError("'{}' must be in 'df_seq'".format(ut_c.COL_ENTRY))
-    seq_info_in_df = set(ut_c.COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq))
-    parts_in_df = set(ut_c.COLS_PARTS).issubset(set(df_seq))
-    seq_in_df = ut_c.COL_SEQ in set(df_seq)
+    if COL_ENTRY not in list(df_seq):
+        raise ValueError("'{}' must be in 'df_seq'".format(COL_ENTRY))
+    seq_info_in_df = set(COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq))
+    parts_in_df = set(COLS_PARTS).issubset(set(df_seq))
+    seq_in_df = COL_SEQ in set(df_seq)
     if "start" in list(df_seq):
-        raise ValueError(f"'df_seq' should not contain 'start' in columns. Change column to '{ut_c.COL_TMD_START}'.")
+        raise ValueError(f"'df_seq' should not contain 'start' in columns. Change column to '{COL_TMD_START}'.")
     if "stop" in list(df_seq):
-        raise ValueError(f"'df_seq' should not contain 'stop' in columns. Change column to '{ut_c.COL_TMD_STOP}'.")
+        raise ValueError(f"'df_seq' should not contain 'stop' in columns. Change column to '{COL_TMD_STOP}'.")
     if not (seq_info_in_df or parts_in_df or seq_in_df):
-        raise ValueError(f"'df_seq' should contain ['{ut_c.COL_SEQ}'], {ut_c.COLS_SEQ_TMD_POS_KEY}, or {ut_c.COLS_PARTS}")
+        raise ValueError(f"'df_seq' should contain ['{COL_SEQ}'], {COLS_SEQ_TMD_POS_KEY}, or {COLS_PARTS}")
     # Check data type in part or sequence columns
     else:
         if seq_info_in_df or seq_in_df:
-            error = f"Sequence column ('{ut_c.COL_SEQ}') should only contain strings"
-            dict_wrong_seq = {ut_c.COL_SEQ: [x for x in df_seq[ut_c.COL_SEQ].values if type(x) != str]}
+            error = f"Sequence column ('{COL_SEQ}') should only contain strings"
+            dict_wrong_seq = {COL_SEQ: [x for x in df_seq[COL_SEQ].values if type(x) != str]}
         else:
-            cols = ut_c.COLS_PARTS
+            cols = COLS_PARTS
             error = f"Part columns ('{cols}') should only contain strings"
-            dict_wrong_seq = {part: [x for x in df_seq[part].values if type(x) != str] for part in ut_c.COLS_PARTS}
+            dict_wrong_seq = {part: [x for x in df_seq[part].values if type(x) != str] for part in COLS_PARTS}
         # Filter empty lists
         dict_wrong_seq = {part: dict_wrong_seq[part] for part in dict_wrong_seq if len(dict_wrong_seq[part]) > 0}
         n_wrong_entries = sum([len(dict_wrong_seq[part]) for part in dict_wrong_seq])
@@ -78,33 +173,33 @@ def check_df_seq(df_seq=None, jmd_n_len=None, jmd_c_len=None):
     # Check if only sequence given -> Convert sequence to tmd
     if seq_in_df and not parts_in_df:
         if seq_info_in_df:
-            for entry, start, stop in zip(df_seq[ut_c.COL_ENTRY], df_seq[ut_c.COL_TMD_START], df_seq[ut_c.COL_TMD_STOP]):
-                ut_check.check_non_negative_number(name=f"tmd_start [{entry}]", val=start)
-                ut_check.check_non_negative_number(name=f"tmd_start [{entry}]", val=stop,)
-            tmd_start = [int(x) for x in df_seq[ut_c.COL_TMD_START]]
-            tmd_stop = [int(x) for x in df_seq[ut_c.COL_TMD_STOP]]
+            for entry, start, stop in zip(df_seq[COL_ENTRY], df_seq[COL_TMD_START], df_seq[COL_TMD_STOP]):
+                check_non_negative_number(name=f"tmd_start [{entry}]", val=start)
+                check_non_negative_number(name=f"tmd_start [{entry}]", val=stop,)
+            tmd_start = [int(x) for x in df_seq[COL_TMD_START]]
+            tmd_stop = [int(x) for x in df_seq[COL_TMD_STOP]]
         else:
             tmd_start = 1 if jmd_n_len is None else 1 + jmd_n_len
-            tmd_stop = [len(x)-1 for x in df_seq[ut_c.COL_SEQ]]
+            tmd_stop = [len(x)-1 for x in df_seq[COL_SEQ]]
             if jmd_c_len is not None:
                 tmd_stop = [x - jmd_c_len for x in tmd_stop]
-        df_seq[ut_c.COL_TMD_START] = tmd_start
-        df_seq[ut_c.COL_TMD_STOP] = tmd_stop
-        seq_info_in_df = set(ut_c.COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq))
+        df_seq[COL_TMD_START] = tmd_start
+        df_seq[COL_TMD_STOP] = tmd_stop
+        seq_info_in_df = set(COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq))
     # Check parameter combinations
     if [jmd_n_len, jmd_c_len].count(None) == 1:
         raise ValueError("'jmd_n_len' and 'jmd_c_len' should both be given (not None) or None")
     if not parts_in_df and seq_info_in_df and jmd_n_len is None and jmd_c_len is None:
         error = f"'jmd_n_len' and 'jmd_c_len' should not be None if " \
-                f"sequence information ({ut_c.COLS_SEQ_TMD_POS_KEY}) are given."
+                f"sequence information ({COLS_SEQ_TMD_POS_KEY}) are given."
         raise ValueError(error)
     if not seq_info_in_df and jmd_n_len is not None and jmd_c_len is not None:
-        error = f"If not all sequence information ({ut_c.COLS_SEQ_TMD_POS_KEY}) are given," \
+        error = f"If not all sequence information ({COLS_SEQ_TMD_POS_KEY}) are given," \
                 f"'jmd_n_len' and 'jmd_c_len' should be None."
         raise ValueError(error)
     if not parts_in_df and seq_info_in_df and (jmd_c_len is None or jmd_n_len is None):
         error = "If part columns ({}) are not in 'df_seq' but sequence information ({}), " \
-                "\n'jmd_n_len' and 'jmd_c_len' should be given (not None).".format(ut_c.COLS_PARTS, ut_c.COLS_SEQ_TMD_POS_KEY)
+                "\n'jmd_n_len' and 'jmd_c_len' should be given (not None).".format(COLS_PARTS, COLS_SEQ_TMD_POS_KEY)
         raise ValueError(error)
     return df_seq
 
@@ -145,17 +240,17 @@ def check_df_cat(df_cat=None, df_scales=None, accept_none=True, verbose=True):
     if not isinstance(df_cat, pd.DataFrame):
         raise ValueError("'df_cat' should be type pd.DataFrame (not {})".format(type(df_cat)))
     # Check columns
-    for col in [ut_c.COL_SCALE_ID, ut_c.COL_CAT, ut_c.COL_SUBCAT]:
+    for col in [COL_SCALE_ID, COL_CAT, COL_SUBCAT]:
         if col not in df_cat:
             raise ValueError(f"'{col}' not in 'df_cat'")
     # Check scales from df_cat and df_scales do match
     if df_scales is not None:
-        scales_cat = list(df_cat[ut_c.COL_SCALE_ID])
+        scales_cat = list(df_cat[COL_SCALE_ID])
         scales = list(df_scales)
         overlap_scales = [x for x in scales if x in scales_cat]
         difference_scales = list(set(scales).difference(set(scales_cat)))
         # Adjust df_cat and df_scales
-        df_cat = df_cat[df_cat[ut_c.COL_SCALE_ID].isin(overlap_scales)]
+        df_cat = df_cat[df_cat[COL_SCALE_ID].isin(overlap_scales)]
         df_scales = df_scales[overlap_scales]
         if verbose and len(difference_scales) > 0:
             str_warning = f"Scales from 'df_scales' and 'df_cat' do not overlap completely."
@@ -171,7 +266,7 @@ def check_df_cat(df_cat=None, df_scales=None, accept_none=True, verbose=True):
 
 def check_df_scales(df_scales=None, df_parts=None, accept_none=False, accept_gaps=False):
     """Check if df_scales is a valid input and matching to df_parts"""
-    ut_check.check_bool(name="accept_gaps", val=accept_gaps)
+    check_bool(name="accept_gaps", val=accept_gaps)
     if accept_none and df_scales is None:
         return  # Skip check
     if not isinstance(df_scales, pd.DataFrame):
@@ -223,16 +318,16 @@ def check_df_feat(df_feat=None, df_cat=None):
     if len(df_feat) == 0 or len(list(df_feat)) == 0:
         raise ValueError("'df_feat' should be not empty")
     # Check if feature column in df_feat
-    if ut_c.COL_FEATURE not in df_feat:
-        raise ValueError(f"'{ut_c.COL_FEATURE}' must be column in 'df_feat'")
-    list_feat = list(df_feat[ut_c.COL_FEATURE])
+    if COL_FEATURE not in df_feat:
+        raise ValueError(f"'{COL_FEATURE}' must be column in 'df_feat'")
+    list_feat = list(df_feat[COL_FEATURE])
     for feat in list_feat:
         if feat.count("-") != 2:
             raise ValueError(f"'{feat}' is no valid feature")
     # Check if df_feat matches df_cat
     if df_cat is not None:
         scales = set([x.split("-")[2] for x in list_feat])
-        list_scales = list(df_cat[ut_c.COL_SCALE_ID])
+        list_scales = list(df_cat[COL_SCALE_ID])
         missing_scales = [x for x in scales if x not in list_scales]
         if len(missing_scales) > 0:
             raise ValueError(f"Following scales occur in 'df_feat' but not in 'df_cat': {missing_scales}")
diff --git a/tests/_utils.py b/tests/_utils.py
index 205b41f5..7dd20404 100644
--- a/tests/_utils.py
+++ b/tests/_utils.py
@@ -9,7 +9,7 @@
 
 # Helper Function
 def _folder_path(super_folder, folder_name):
-    """Modification of separator (OS depending)"""
+    """Modification of separator (OS-dependent)"""
     path = os.path.join(super_folder, folder_name + SEP)
     return path
 
diff --git a/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz b/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz
index 86a33b00..7df482fa 100644
Binary files a/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz and b/tests/unit/.hypothesis/unicode_data/13.0.0/codec-utf-8.json.gz differ
diff --git a/tests/unit/test_cpp_feature.py b/tests/unit/test_cpp_feature.py
index 9bb77b8e..53be9b61 100644
--- a/tests/unit/test_cpp_feature.py
+++ b/tests/unit/test_cpp_feature.py
@@ -11,55 +11,6 @@
 
 
 # I Unit Tests
-class TestLoadScales:
-    """Unit test for loading scales"""
-
-    # Positive unit test
-    def test_load_data(self):
-        """Unit test for aa.SequenceFeature().load_scales() method"""
-        sf = aa.SequenceFeature()
-        assert isinstance(sf.load_scales(clust_th=0.5), pd.DataFrame)
-
-    # Negative test
-    def test_wrong_clustered_values(self):
-        sf = aa.SequenceFeature()
-        for i in [0.1, -0.2, "a", None]:
-            with pytest.raises(ValueError):
-                sf.load_scales(clust_th=i)
-
-    # Property-based testing
-    @given(clustered=some.floats(min_value=-10, max_value=10))
-    def test_clustered_integer(self, clustered):
-        sf = aa.SequenceFeature()
-        if clustered not in [0.5, 0.7]:
-            with pytest.raises(ValueError):
-                sf.load_scales(clust_th=clustered)
-
-
-class TestLoadCategories:
-    """Unit test for loading DataFrame with sequence categories"""
-
-    # Positive unit test
-    def test_load_categories(self):
-        sf = aa.SequenceFeature()
-        assert isinstance(aa.load_scales(clust_th=0.5), pd.DataFrame)
-
-    # Negative test
-    def test_wrong_clustered_values(self):
-        sf = aa.SequenceFeature()
-        for i in [0.1, -0.2, "a", None]:
-            with pytest.raises(ValueError):
-                aa.load_scales(clust_th=i)
-
-    # Property-based testing
-    @given(clustered=some.floats(min_value=-10, max_value=10))
-    def test_clustered_integer(self, clustered):
-        sf = aa.SequenceFeature()
-        if clustered not in [0.5, 0.7]:
-            with pytest.raises(ValueError):
-                aa.load_scales(clust_th=clustered)
-
-
 class TestGetDfParts:
     """Unit test for loading DataFrame with sequence parts"""
 
@@ -407,10 +358,10 @@ def test_sequence_feature(list_splits):
     """Positive regression/functional test of all aa.SequenceFeature() methods"""
     sf = aa.SequenceFeature()
     # Get test set of sequences
-    df_seq = sf.load_sequences()
+    df_seq = aa.load_dataset()
     # Get feature components
     df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=False)
-    df_scales = sf.load_scales()
+    df_scales = aa.load_scales()
     split_kws = sf.get_split_kws()
     # Get features (names, values, matrix)
     features = sf.get_features()[0:100]
diff --git a/tutorials/prelude_on_plotting.ipynb b/tutorials/prelude_on_plotting.ipynb
new file mode 100644
index 00000000..29cd7437
--- /dev/null
+++ b/tutorials/prelude_on_plotting.ipynb
@@ -0,0 +1,35 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tutorials/tutorial1_quick_start.ipynb b/tutorials/tutorial1_quick_start.ipynb
old mode 100755
new mode 100644
index 008c3d8d..13033a25
--- a/tutorials/tutorial1_quick_start.ipynb
+++ b/tutorials/tutorial1_quick_start.ipynb
@@ -1,22 +1,55 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# Quick Start with AAanalysis\n",
+    "Dive into the powerful capabilities of ``AAanalysis``—a Python framework dedicated to sequence-based, alignment-free protein prediction. In this tutorial, using gamma-secretase substrates and non-substrates as an example, we'll focus on extracting interpretable features from protein sequences using the ``AAclust`` and ``CPP`` models and on how these features can be harnessed for binary classification tasks.\n",
+    "\n",
+    "What You Will Learn:\n",
+    "- ``Loading Sequences and Scales``: How to easily load protein sequences and their amino acid scales.\n",
+    "- ``Feature Engineering``: Extract essential features using the ``AAclust`` and ``CPP`` models.\n",
+    "- ``Protein Prediction``: Make predictions using the RandomForest model.\n",
+    "- ``Explainable AI``: Interpret predictions at the group and individual levels by combining ``CPP`` with ``SHAP``.\n",
+    "\n",
+    "## 1. Loading Sequences and Scales\n",
+    "With AAanalysis, you have access to numerous benchmark datasets for protein sequence analysis. Using our γ-secretase substrates and non-substrates dataset as a hands-on example, you can effortlessly retrieve these datasets using the ``aa.load_dataset()`` function. Furthermore, amino acid scales, predominantly from AAindex, along with their hierarchical classification (known as ``AAontology``), are available at your fingertips with the ``aa.load_scales()`` function."
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {
     "pycharm": {
-     "name": "#%%\n",
-     "is_executing": true
+     "name": "#%%\n"
+    },
+    "ExecuteTime": {
+     "end_time": "2023-09-23T14:15:04.562034649Z",
+     "start_time": "2023-09-23T14:15:04.508201346Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "     entry                                           sequence  label  tmd_start  tmd_stop       jmd_n                      tmd       jmd_c\n0   Q14802  MQKVTLGLLVFLAGFPVLDANDLEDKNSPFYYDWHSLQVGGLICAG...      0         37        59  NSPFYYDWHS  LQVGGLICAGVLCAMGIIIVMSA  KCKCKFGQKS\n1   Q86UE4  MAARSWQDELAQQAEEGSARLREMLSVGLGFLRTELGLDLGLEPKR...      0         50        72  LGLEPKRYPG  WVILVGTGALGLLLLFLLGYGWA  AACAGARKKR\n2   Q969W9  MHRLMGVNSTAAAAAGQPNVSCTCNCKRSLFQSMEITELEFVQIII...      0         41        63  FQSMEITELE  FVQIIIIVVVMMVMVVVITCLLS  HYKLSARSFI\n3   P53801  MAPGVARGPTPYWRLRLGGAALLLLLIPVAAAQEPPGAACSQNTNK...      0         97       119  RWGVCWVNFE  ALIITMSVVGGTLLLGIAICCCC  CCRRKRSRKP\n4   Q8IUW5  MAPRALPGSAVLAAAVFVGGAVSSPLVAPDNGSSRTLHSRTETTPS...      0         59        81  NDTGNGHPEY  IAYALVPVFFIMGLFGVLICHLL  KKKGYRCTTE\n..     ...                                                ...    ...        ...       ...         ...                      ...         ...\n95  P15209  MSPWLKWHGPAMARLWGLCLLVLGFWRASLACPTSCKCSSARIWCT...      1        431       453  VADQSNREHL  SVYAVVVIASVVGFCLLVMLLLL  KLARHSKFGM\n96  Q86YL7  MWKVSALLFVLGSASLWVLAEGASTGQPEDDTETTGLEGGVAMPGA...      1        130       152  TVEKDGLSTV  TLVGIIVGVLLAIGFIGAIIVVV  MRKMSGRYSP\n97  Q13308  MGAARGSPARPRRLPLLSVLLLPLLGGTQTAIVFIKQPSSQDALQG...      1        704       726  GSPPPYKMIQ  TIGLSVGAAVAYIIAVLGLMFYC  KKRCKAKRLQ\n98  P10586  MAPEPAPGRTMVPLVPALVMLGLVAGAHGDSKPVFIKVPEDQTGLS...      1       1262      1284  PAQQQEEPEM  LWVTGPVLAVILIILIVIAILLF  KRKRTHSPSS\n99  P28828  MRTLGTCLVTLAGLLLTAAGETFSGGCLFDEPYSTCGYSQADEDDF...      1        743       764  PEKQTDHTVK   IAGVIAGILLFVIIFLGVVLVM  KKRKLAKKRK\n\n[100 rows x 8 columns]",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>entry</th>\n      <th>sequence</th>\n      <th>label</th>\n      <th>tmd_start</th>\n      <th>tmd_stop</th>\n      <th>jmd_n</th>\n      <th>tmd</th>\n      <th>jmd_c</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Q14802</td>\n      <td>MQKVTLGLLVFLAGFPVLDANDLEDKNSPFYYDWHSLQVGGLICAG...</td>\n      <td>0</td>\n      <td>37</td>\n      <td>59</td>\n      <td>NSPFYYDWHS</td>\n      <td>LQVGGLICAGVLCAMGIIIVMSA</td>\n      <td>KCKCKFGQKS</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Q86UE4</td>\n      <td>MAARSWQDELAQQAEEGSARLREMLSVGLGFLRTELGLDLGLEPKR...</td>\n      <td>0</td>\n      <td>50</td>\n      <td>72</td>\n      <td>LGLEPKRYPG</td>\n      <td>WVILVGTGALGLLLLFLLGYGWA</td>\n      <td>AACAGARKKR</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Q969W9</td>\n      <td>MHRLMGVNSTAAAAAGQPNVSCTCNCKRSLFQSMEITELEFVQIII...</td>\n      <td>0</td>\n      <td>41</td>\n      <td>63</td>\n      <td>FQSMEITELE</td>\n      <td>FVQIIIIVVVMMVMVVVITCLLS</td>\n      <td>HYKLSARSFI</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>P53801</td>\n      <td>MAPGVARGPTPYWRLRLGGAALLLLLIPVAAAQEPPGAACSQNTNK...</td>\n      <td>0</td>\n      <td>97</td>\n      <td>119</td>\n      <td>RWGVCWVNFE</td>\n      <td>ALIITMSVVGGTLLLGIAICCCC</td>\n      <td>CCRRKRSRKP</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Q8IUW5</td>\n      <td>MAPRALPGSAVLAAAVFVGGAVSSPLVAPDNGSSRTLHSRTETTPS...</td>\n      <td>0</td>\n      <td>59</td>\n      <td>81</td>\n      <td>NDTGNGHPEY</td>\n      <td>IAYALVPVFFIMGLFGVLICHLL</td>\n      <td>KKKGYRCTTE</td>\n    </tr>\n    <tr>\n  
    <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>95</th>\n      <td>P15209</td>\n      <td>MSPWLKWHGPAMARLWGLCLLVLGFWRASLACPTSCKCSSARIWCT...</td>\n      <td>1</td>\n      <td>431</td>\n      <td>453</td>\n      <td>VADQSNREHL</td>\n      <td>SVYAVVVIASVVGFCLLVMLLLL</td>\n      <td>KLARHSKFGM</td>\n    </tr>\n    <tr>\n      <th>96</th>\n      <td>Q86YL7</td>\n      <td>MWKVSALLFVLGSASLWVLAEGASTGQPEDDTETTGLEGGVAMPGA...</td>\n      <td>1</td>\n      <td>130</td>\n      <td>152</td>\n      <td>TVEKDGLSTV</td>\n      <td>TLVGIIVGVLLAIGFIGAIIVVV</td>\n      <td>MRKMSGRYSP</td>\n    </tr>\n    <tr>\n      <th>97</th>\n      <td>Q13308</td>\n      <td>MGAARGSPARPRRLPLLSVLLLPLLGGTQTAIVFIKQPSSQDALQG...</td>\n      <td>1</td>\n      <td>704</td>\n      <td>726</td>\n      <td>GSPPPYKMIQ</td>\n      <td>TIGLSVGAAVAYIIAVLGLMFYC</td>\n      <td>KKRCKAKRLQ</td>\n    </tr>\n    <tr>\n      <th>98</th>\n      <td>P10586</td>\n      <td>MAPEPAPGRTMVPLVPALVMLGLVAGAHGDSKPVFIKVPEDQTGLS...</td>\n      <td>1</td>\n      <td>1262</td>\n      <td>1284</td>\n      <td>PAQQQEEPEM</td>\n      <td>LWVTGPVLAVILIILIVIAILLF</td>\n      <td>KRKRTHSPSS</td>\n    </tr>\n    <tr>\n      <th>99</th>\n      <td>P28828</td>\n      <td>MRTLGTCLVTLAGLLLTAAGETFSGGCLFDEPYSTCGYSQADEDDF...</td>\n      <td>1</td>\n      <td>743</td>\n      <td>764</td>\n      <td>PEKQTDHTVK</td>\n      <td>IAGVIAGILLFVIIFLGVVLVM</td>\n      <td>KKRKLAKKRK</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 8 columns</p>\n</div>"
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "import numpy as np\n",
-    "from sklearn.cluster import AgglomerativeClustering\n",
-    "from sklearn.ensemble import RandomForestClassifier\n",
-    "from sklearn.model_selection import cross_val_score\n",
-    "\n",
-    "import aaanalysis as aa"
+    "import aaanalysis as aa\n",
+    "# Load scales and scale categories (AAontology) \n",
+    "df_scales = aa.load_scales()\n",
+    "df_cat = aa.load_scales(name=\"scales_cat\")\n",
+    "# Load training data\n",
+    "df_seq = aa.load_dataset(name=\"DOM_GSEC\", n=50)\n",
+    "df_seq"
    ]
   },
   {
@@ -27,100 +60,144 @@
     }
    },
    "source": [
-    "# Loading scales and sequence data using AAanalysis\n",
-    "AAanalysis is a python framework for sequence-based and alignment-free protein prediction. It comprises several example datasets for sequence and residue predictions tasks, which can be retrieved using the aa.load_dataset() function. Amino acid scales (most from AAindex) and their hierarchical classification (named AAontology) can be accessed using the aa.load_scales() function. Since redundancy is an essential problem for machine learning tasks, the AAclust() object provides a lightweight wrapper for sklearn clustering algorithms such as Agglomerative clustering. AAclust clusters a set of scales and selects for each cluster the most representative scale (i.e., the scale closes to the cluster center)."
+    "## 2.  Feature Engineering\n",
+    "The centerpiece of AAanalysis is the Comparative Physicochemical Profiling (``CPP``) model, which is supported by ``AAclust`` for the pre-selection of amino acid scales. \n",
+    "\n",
+    "### AAclust\n",
+    "Since redundancy is an essential problem for machine learning tasks, the ``AAclust`` object provides a lightweight wrapper for sklearn clustering algorithms such as Agglomerative clustering. AAclust clusters a set of scales and selects for each cluster the most representative scale (i.e., the scale closest to the cluster center)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
+   "execution_count": 9,
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "               entry                                           sequence  label\n",
-      "1        DISULFIDE_2  PLHHLXIGTWTPPGAIFTVQFDDEKLTCKLIKRTEIPQDEPISWXT...      0\n",
-      "4        DISULFIDE_5  GFPELKNDTFLRAAWGEETDYTPVWCMRQAGRYLPEFRETRAAQDF...      0\n",
-      "5        DISULFIDE_6  MKIIRIETSRIAVPLTKPFKTALRTVYTAESVIVRITYDSGAVGWG...      0\n",
-      "7        DISULFIDE_8  AAERVFISPAKYVQGKNVITKIANYLEGIGNKTVVIADEIVWKIAG...      0\n",
-      "9       DISULFIDE_10  MKFTVEREHLLKPLQQVSGPLGGRPTLPILGNLLLQVADGTLSLTG...      0\n",
-      "...              ...                                                ...    ...\n",
-      "2130  DISULFIDE_2131  DSLDEQRSRYAQIKQAWDNRQMDVVEQMMPGLKDYPLYPYLEYRQI...      1\n",
-      "2132  DISULFIDE_2133  SRTHVCQSDTHIFIIMGASGDLAKKKIYPTIWWLFRDGLLPENTFI...      1\n",
-      "2134  DISULFIDE_2135  ATTYNAVVSKSSSDGKTFKTIADAIASAPAGSTPFVILIKNGVYNE...      1\n",
-      "2137  DISULFIDE_2138  KQFSQEFRDGYSILKHYGGNGPYSERVSYGIARDPPTSCEVDQVIM...      1\n",
-      "2147  DISULFIDE_2148  MRNRREVSKLLSERVLLLDGAYGTEFMKYGYDDLPEELNIKAPDVV...      1\n",
-      "\n",
-      "[200 rows x 3 columns]\n"
-     ]
+     "data": {
+      "text/plain": "    SUEM840101  NISK860101  KANM800101  CHOP780101  MIYS990105  FAUJ880103  QIAN880126  MUNV940105  LINS030104  JOND920101\nAA                                                                                                                        \nA        0.788       0.406       0.875       0.174       0.492       0.124       0.451       0.175       0.093       0.818\nC        0.544       0.906       0.312       0.661       0.016       0.301       0.324       0.089       0.000       0.078\nD        0.146       0.006       0.542       0.908       0.825       0.344       0.745       0.337       0.588       0.494\nE        0.622       0.055       1.000       0.248       0.857       0.468       0.471       0.182       0.804       0.623\nF        0.813       0.968       0.552       0.119       0.000       0.729       0.186       0.066       0.082       0.338\nG        0.000       0.262       0.115       1.000       0.492       0.000       0.676       0.393       0.144       0.779\nH        0.425       0.559       0.615       0.440       0.492       0.577       0.696       0.125       0.423       0.117\nI        0.901       1.000       0.583       0.000       0.079       0.495       0.314       0.050       0.010       0.506\nK        0.571       0.000       0.729       0.495       1.000       0.590       0.088       0.155       1.000       0.584\nL        0.901       0.942       0.719       0.110       0.016       0.495       0.059       0.152       0.041       1.000\nM        1.000       0.788       0.969       0.119       0.127       0.548       0.000       0.083       0.082       0.130\nN        0.317       0.157       0.385       1.000       0.683       0.365       0.608       0.244       0.557       0.377\nP        0.112       0.118       0.000       0.963       0.698       0.337       0.873       1.000       0.454       0.481\nQ        0.634       0.145       0.646       0.468       0.762       0.489       0.471       0.152       0.711       
0.351\nR        0.726       0.333       0.500       0.440       0.651       0.759       0.529       0.119       0.845       0.481\nS        0.278       0.137       0.229       0.881       0.698       0.198       0.931       0.109       0.309       0.714\nT        0.371       0.305       0.302       0.450       0.603       0.322       1.000       0.040       0.330       0.584\nV        0.589       0.884       0.438       0.028       0.159       0.371       0.510       0.000       0.031       0.675\nW        0.847       0.961       0.469       0.450       0.095       1.000       0.196       0.076       0.206       0.000\nY        0.704       0.649       0.281       0.615       0.159       0.801       0.853       0.036       0.268       0.234",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>SUEM840101</th>\n      <th>NISK860101</th>\n      <th>KANM800101</th>\n      <th>CHOP780101</th>\n      <th>MIYS990105</th>\n      <th>FAUJ880103</th>\n      <th>QIAN880126</th>\n      <th>MUNV940105</th>\n      <th>LINS030104</th>\n      <th>JOND920101</th>\n    </tr>\n    <tr>\n      <th>AA</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>A</th>\n      <td>0.788</td>\n      <td>0.406</td>\n      <td>0.875</td>\n      <td>0.174</td>\n      <td>0.492</td>\n      <td>0.124</td>\n      <td>0.451</td>\n      <td>0.175</td>\n      <td>0.093</td>\n      <td>0.818</td>\n    </tr>\n    <tr>\n      <th>C</th>\n      <td>0.544</td>\n      <td>0.906</td>\n      <td>0.312</td>\n      <td>0.661</td>\n      <td>0.016</td>\n      <td>0.301</td>\n      <td>0.324</td>\n      <td>0.089</td>\n      <td>0.000</td>\n      <td>0.078</td>\n    </tr>\n    <tr>\n      <th>D</th>\n      <td>0.146</td>\n      <td>0.006</td>\n      <td>0.542</td>\n      <td>0.908</td>\n      <td>0.825</td>\n      <td>0.344</td>\n      <td>0.745</td>\n      <td>0.337</td>\n      <td>0.588</td>\n      <td>0.494</td>\n    </tr>\n    <tr>\n      <th>E</th>\n      <td>0.622</td>\n      <td>0.055</td>\n      <td>1.000</td>\n      <td>0.248</td>\n      <td>0.857</td>\n      <td>0.468</td>\n      <td>0.471</td>\n      <td>0.182</td>\n      <td>0.804</td>\n      <td>0.623</td>\n    </tr>\n    <tr>\n      <th>F</th>\n      <td>0.813</td>\n      
<td>0.968</td>\n      <td>0.552</td>\n      <td>0.119</td>\n      <td>0.000</td>\n      <td>0.729</td>\n      <td>0.186</td>\n      <td>0.066</td>\n      <td>0.082</td>\n      <td>0.338</td>\n    </tr>\n    <tr>\n      <th>G</th>\n      <td>0.000</td>\n      <td>0.262</td>\n      <td>0.115</td>\n      <td>1.000</td>\n      <td>0.492</td>\n      <td>0.000</td>\n      <td>0.676</td>\n      <td>0.393</td>\n      <td>0.144</td>\n      <td>0.779</td>\n    </tr>\n    <tr>\n      <th>H</th>\n      <td>0.425</td>\n      <td>0.559</td>\n      <td>0.615</td>\n      <td>0.440</td>\n      <td>0.492</td>\n      <td>0.577</td>\n      <td>0.696</td>\n      <td>0.125</td>\n      <td>0.423</td>\n      <td>0.117</td>\n    </tr>\n    <tr>\n      <th>I</th>\n      <td>0.901</td>\n      <td>1.000</td>\n      <td>0.583</td>\n      <td>0.000</td>\n      <td>0.079</td>\n      <td>0.495</td>\n      <td>0.314</td>\n      <td>0.050</td>\n      <td>0.010</td>\n      <td>0.506</td>\n    </tr>\n    <tr>\n      <th>K</th>\n      <td>0.571</td>\n      <td>0.000</td>\n      <td>0.729</td>\n      <td>0.495</td>\n      <td>1.000</td>\n      <td>0.590</td>\n      <td>0.088</td>\n      <td>0.155</td>\n      <td>1.000</td>\n      <td>0.584</td>\n    </tr>\n    <tr>\n      <th>L</th>\n      <td>0.901</td>\n      <td>0.942</td>\n      <td>0.719</td>\n      <td>0.110</td>\n      <td>0.016</td>\n      <td>0.495</td>\n      <td>0.059</td>\n      <td>0.152</td>\n      <td>0.041</td>\n      <td>1.000</td>\n    </tr>\n    <tr>\n      <th>M</th>\n      <td>1.000</td>\n      <td>0.788</td>\n      <td>0.969</td>\n      <td>0.119</td>\n      <td>0.127</td>\n      <td>0.548</td>\n      <td>0.000</td>\n      <td>0.083</td>\n      <td>0.082</td>\n      <td>0.130</td>\n    </tr>\n    <tr>\n      <th>N</th>\n      <td>0.317</td>\n      <td>0.157</td>\n      <td>0.385</td>\n      <td>1.000</td>\n      <td>0.683</td>\n      <td>0.365</td>\n      <td>0.608</td>\n      <td>0.244</td>\n      <td>0.557</td>\n      
<td>0.377</td>\n    </tr>\n    <tr>\n      <th>P</th>\n      <td>0.112</td>\n      <td>0.118</td>\n      <td>0.000</td>\n      <td>0.963</td>\n      <td>0.698</td>\n      <td>0.337</td>\n      <td>0.873</td>\n      <td>1.000</td>\n      <td>0.454</td>\n      <td>0.481</td>\n    </tr>\n    <tr>\n      <th>Q</th>\n      <td>0.634</td>\n      <td>0.145</td>\n      <td>0.646</td>\n      <td>0.468</td>\n      <td>0.762</td>\n      <td>0.489</td>\n      <td>0.471</td>\n      <td>0.152</td>\n      <td>0.711</td>\n      <td>0.351</td>\n    </tr>\n    <tr>\n      <th>R</th>\n      <td>0.726</td>\n      <td>0.333</td>\n      <td>0.500</td>\n      <td>0.440</td>\n      <td>0.651</td>\n      <td>0.759</td>\n      <td>0.529</td>\n      <td>0.119</td>\n      <td>0.845</td>\n      <td>0.481</td>\n    </tr>\n    <tr>\n      <th>S</th>\n      <td>0.278</td>\n      <td>0.137</td>\n      <td>0.229</td>\n      <td>0.881</td>\n      <td>0.698</td>\n      <td>0.198</td>\n      <td>0.931</td>\n      <td>0.109</td>\n      <td>0.309</td>\n      <td>0.714</td>\n    </tr>\n    <tr>\n      <th>T</th>\n      <td>0.371</td>\n      <td>0.305</td>\n      <td>0.302</td>\n      <td>0.450</td>\n      <td>0.603</td>\n      <td>0.322</td>\n      <td>1.000</td>\n      <td>0.040</td>\n      <td>0.330</td>\n      <td>0.584</td>\n    </tr>\n    <tr>\n      <th>V</th>\n      <td>0.589</td>\n      <td>0.884</td>\n      <td>0.438</td>\n      <td>0.028</td>\n      <td>0.159</td>\n      <td>0.371</td>\n      <td>0.510</td>\n      <td>0.000</td>\n      <td>0.031</td>\n      <td>0.675</td>\n    </tr>\n    <tr>\n      <th>W</th>\n      <td>0.847</td>\n      <td>0.961</td>\n      <td>0.469</td>\n      <td>0.450</td>\n      <td>0.095</td>\n      <td>1.000</td>\n      <td>0.196</td>\n      <td>0.076</td>\n      <td>0.206</td>\n      <td>0.000</td>\n    </tr>\n    <tr>\n      <th>Y</th>\n      <td>0.704</td>\n      <td>0.649</td>\n      <td>0.281</td>\n      <td>0.615</td>\n      <td>0.159</td>\n      
<td>0.801</td>\n      <td>0.853</td>\n      <td>0.036</td>\n      <td>0.268</td>\n      <td>0.234</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "# Load training data\n",
-    "df_info = aa.load_dataset()\n",
-    "df = aa.load_dataset(name=\"SEQ_DISULFIDE\", min_len=300, n=100)\n",
-    "print(df)\n",
-    "# Load scales and scale categories from AAanalysis\n",
-    "df_scales = aa.load_scales()\n",
-    "df_cat = aa.load_scales(name=\"scales_cat\")\n",
-    "# Select scales using AAclust\n",
+    "from sklearn.cluster import AgglomerativeClustering\n",
+    "import numpy as np\n",
     "aac = aa.AAclust(model=AgglomerativeClustering, model_kwargs=dict(linkage=\"ward\"))\n",
     "X = np.array(df_scales).T\n",
-    "scales = aac.fit(X, n_clusters=10, names=list(df_scales))   # Number of clusters = number of selected scales (100 is recommended)\n",
-    "df_cat = df_cat[df_cat[\"scale_id\"].isin(scales)]\n",
-    "df_scales = df_scales[scales]"
-   ]
+    "scales = aac.fit(X, n_clusters=10, names=list(df_scales)) \n",
+    "df_scales = df_scales[scales]\n",
+    "df_scales"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-09-23T14:15:07.346288614Z",
+     "start_time": "2023-09-23T14:15:07.280822767Z"
+    }
+   }
   },
   {
    "cell_type": "markdown",
+   "source": [
+    "### Comparative Physicochemical Profiling (CPP)\n",
+    " CPP is a sequence-based feature engineering algorithm. It aims at identifying a set of features most discriminant between two sets of sequences: the test set and the reference set. Supported by the ``SequenceFeature`` object (``sf``), a CPP feature integrates:\n",
+    " \n",
+    "- ``Parts``: Combinations of a target middle domain (TMD) and N- and C-terminal adjacent regions (JMD-N and JMD-C, respectively), obtained via ``sf.get_df_parts()``.\n",
+    "- ``Splits``: These `Parts` can be split into various continuous segments or discontinuous patterns, specified via ``sf.get_split_kws()``. \n",
+    "- ``Scales``: Sets of amino acid scales."
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
    "metadata": {
     "pycharm": {
-     "name": "#%% md\n"
+     "name": "#%%\n"
+    },
+    "ExecuteTime": {
+     "end_time": "2023-09-23T14:15:11.021981566Z",
+     "start_time": "2023-09-23T14:15:10.913819589Z"
     }
    },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "                            tmd             jmd_n_tmd_n            tmd_c_jmd_c\nQ14802  LQVGGLICAGVLCAMGIIIVMSA  NSPFYYDWHSLQVGGLICAGVL  CAMGIIIVMSAKCKCKFGQKS\nQ86UE4  WVILVGTGALGLLLLFLLGYGWA  LGLEPKRYPGWVILVGTGALGL  LLLFLLGYGWAAACAGARKKR\nQ969W9  FVQIIIIVVVMMVMVVVITCLLS  FQSMEITELEFVQIIIIVVVMM  VMVVVITCLLSHYKLSARSFI\nP53801  ALIITMSVVGGTLLLGIAICCCC  RWGVCWVNFEALIITMSVVGGT  LLLGIAICCCCCCRRKRSRKP\nQ8IUW5  IAYALVPVFFIMGLFGVLICHLL  NDTGNGHPEYIAYALVPVFFIM  GLFGVLICHLLKKKGYRCTTE\n...                         ...                     ...                    ...\nP15209  SVYAVVVIASVVGFCLLVMLLLL  VADQSNREHLSVYAVVVIASVV  GFCLLVMLLLLKLARHSKFGM\nQ86YL7  TLVGIIVGVLLAIGFIGAIIVVV  TVEKDGLSTVTLVGIIVGVLLA  IGFIGAIIVVVMRKMSGRYSP\nQ13308  TIGLSVGAAVAYIIAVLGLMFYC  GSPPPYKMIQTIGLSVGAAVAY  IIAVLGLMFYCKKRCKAKRLQ\nP10586  LWVTGPVLAVILIILIVIAILLF  PAQQQEEPEMLWVTGPVLAVIL  IILIVIAILLFKRKRTHSPSS\nP28828   IAGVIAGILLFVIIFLGVVLVM   PEKQTDHTVKIAGVIAGILLF  VIIFLGVVLVMKKRKLAKKRK\n\n[100 rows x 3 columns]",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>tmd</th>\n      <th>jmd_n_tmd_n</th>\n      <th>tmd_c_jmd_c</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Q14802</th>\n      <td>LQVGGLICAGVLCAMGIIIVMSA</td>\n      <td>NSPFYYDWHSLQVGGLICAGVL</td>\n      <td>CAMGIIIVMSAKCKCKFGQKS</td>\n    </tr>\n    <tr>\n      <th>Q86UE4</th>\n      <td>WVILVGTGALGLLLLFLLGYGWA</td>\n      <td>LGLEPKRYPGWVILVGTGALGL</td>\n      <td>LLLFLLGYGWAAACAGARKKR</td>\n    </tr>\n    <tr>\n      <th>Q969W9</th>\n      <td>FVQIIIIVVVMMVMVVVITCLLS</td>\n      <td>FQSMEITELEFVQIIIIVVVMM</td>\n      <td>VMVVVITCLLSHYKLSARSFI</td>\n    </tr>\n    <tr>\n      <th>P53801</th>\n      <td>ALIITMSVVGGTLLLGIAICCCC</td>\n      <td>RWGVCWVNFEALIITMSVVGGT</td>\n      <td>LLLGIAICCCCCCRRKRSRKP</td>\n    </tr>\n    <tr>\n      <th>Q8IUW5</th>\n      <td>IAYALVPVFFIMGLFGVLICHLL</td>\n      <td>NDTGNGHPEYIAYALVPVFFIM</td>\n      <td>GLFGVLICHLLKKKGYRCTTE</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>P15209</th>\n      <td>SVYAVVVIASVVGFCLLVMLLLL</td>\n      <td>VADQSNREHLSVYAVVVIASVV</td>\n      <td>GFCLLVMLLLLKLARHSKFGM</td>\n    </tr>\n    <tr>\n      <th>Q86YL7</th>\n      <td>TLVGIIVGVLLAIGFIGAIIVVV</td>\n      <td>TVEKDGLSTVTLVGIIVGVLLA</td>\n      <td>IGFIGAIIVVVMRKMSGRYSP</td>\n    </tr>\n    <tr>\n      <th>Q13308</th>\n      <td>TIGLSVGAAVAYIIAVLGLMFYC</td>\n      <td>GSPPPYKMIQTIGLSVGAAVAY</td>\n      <td>IIAVLGLMFYCKKRCKAKRLQ</td>\n    </tr>\n    <tr>\n      <th>P10586</th>\n      <td>LWVTGPVLAVILIILIVIAILLF</td>\n      <td>PAQQQEEPEMLWVTGPVLAVIL</td>\n      
<td>IILIVIAILLFKRKRTHSPSS</td>\n    </tr>\n    <tr>\n      <th>P28828</th>\n      <td>IAGVIAGILLFVIIFLGVVLVM</td>\n      <td>PEKQTDHTVKIAGVIAGILLF</td>\n      <td>VIIFLGVVLVMKKRKLAKKRK</td>\n    </tr>\n  </tbody>\n</table>\n<p>100 rows × 3 columns</p>\n</div>"
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "# Feature Engineering\n",
-    "*Comparative Physicochemical Profiling (CPP)* is a sequence-based feature engineering algorithm aiming at identifying a set of features that is most discriminant between two sets of sequences, called test set and reference set. A CPP feature is a combination of a *Part*, a *Split*, and a *Scale*. *Parts* are combination of a target middle domain (TMD) and N- and C-terminal adjacent regions (JMD-N and JMD-C, respectively). They can be obtained from a dataframe with sequences using the sf.get_df_parts() method from the SequenceFeature class, where the length of the JMDs can be specified. These *Parts* can be split into various continuous segments or discontinuous patterns, for which the sf.get_split_kws() method creates a parameter dictionary. The scales (and or scale categories), the parts, and the split parameters are used to instantiates the CPP class. Running the CPP algorithm creates all *Part*, *Split*, *Split* combinations and filters a selected maximum (via 'n_filter' argument) of non-redundant features."
+    "y = list(df_seq[\"label\"])\n",
+    "sf = aa.SequenceFeature()\n",
+    "df_parts = sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=10)\n",
+    "split_kws = sf.get_split_kws(n_split_max=1, split_types=[\"Segment\"])\n",
+    "df_parts"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 3,
+   "cell_type": "markdown",
+   "source": [
+    "Running the CPP algorithm creates all `Part`, `Split`, `Scale` combinations and filters a selected maximum of non-redundant features:"
+   ],
    "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1. CPP creates 30 features for 200 samples\n",
-      "   |#########################| 100.00%\n",
+      "1. CPP creates 30 features for 100 samples\n",
+      "   |#########################| 100.00%\u001B[0m91mm\n",
       "2. CPP pre-filters 1 features (5%) with highest 'abs_mean_dif' and 'max_std_test' <= 0.2\n",
-      "3. CPP filtering algorithm\n",
-      "4. CPP returns df with 1 unique features including general information and statistics\n"
+      "3. CPP filtering algorithm\n"
+     ]
+    },
+    {
+     "ename": "ValueError",
+     "evalue": "'jmd_n_seq' should be string (type=<class 'list'>)",
+     "output_type": "error",
+     "traceback": [
+      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[0;31mValueError\u001B[0m                                Traceback (most recent call last)",
+      "Cell \u001B[0;32mIn[12], line 3\u001B[0m\n\u001B[1;32m      1\u001B[0m \u001B[38;5;66;03m# Small set of features (300 features created)\u001B[39;00m\n\u001B[1;32m      2\u001B[0m cpp \u001B[38;5;241m=\u001B[39m aa\u001B[38;5;241m.\u001B[39mCPP(df_parts\u001B[38;5;241m=\u001B[39mdf_parts, df_cat\u001B[38;5;241m=\u001B[39mdf_cat, df_scales\u001B[38;5;241m=\u001B[39mdf_scales, split_kws\u001B[38;5;241m=\u001B[39msplit_kws)\n\u001B[0;32m----> 3\u001B[0m df_feat \u001B[38;5;241m=\u001B[39m \u001B[43mcpp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrun\u001B[49m\u001B[43m(\u001B[49m\u001B[43mlabels\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43my\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtmd_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m20\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mn_filter\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m100\u001B[39;49m\u001B[43m)\u001B[49m  \n",
+      "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/cpp/cpp.py:310\u001B[0m, in \u001B[0;36mCPP.run\u001B[0;34m(self, labels, parametric, n_filter, tmd_len, jmd_n_len, jmd_c_len, ext_len, start, check_cat, n_pre_filter, pct_pre_filter, max_std_test, max_overlap, max_cor, n_processes)\u001B[0m\n\u001B[1;32m    308\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_verbose:\n\u001B[1;32m    309\u001B[0m     \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m3. CPP filtering algorithm\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m--> 310\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_add_positions\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf_feat\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdf\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtmd_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtmd_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstart\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mstart\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m    311\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_add_scale_info(df_feat\u001B[38;5;241m=\u001B[39mdf)\n\u001B[1;32m    312\u001B[0m df_feat \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_filtering(df\u001B[38;5;241m=\u001B[39mdf, n_filter\u001B[38;5;241m=\u001B[39mn_filter, check_cat\u001B[38;5;241m=\u001B[39mcheck_cat, max_overlap\u001B[38;5;241m=\u001B[39mmax_overlap, max_cor\u001B[38;5;241m=\u001B[39mmax_cor)\n",
+      "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/cpp/cpp.py:161\u001B[0m, in \u001B[0;36mCPP._add_positions\u001B[0;34m(df_feat, tmd_len, jmd_n_len, jmd_c_len, ext_len, start)\u001B[0m\n\u001B[1;32m    159\u001B[0m features \u001B[38;5;241m=\u001B[39m df_feat[ut\u001B[38;5;241m.\u001B[39mCOL_FEATURE]\u001B[38;5;241m.\u001B[39mto_list()\n\u001B[1;32m    160\u001B[0m sf \u001B[38;5;241m=\u001B[39m SequenceFeature()\n\u001B[0;32m--> 161\u001B[0m feat_positions \u001B[38;5;241m=\u001B[39m \u001B[43msf\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43madd_position\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfeatures\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mfeatures\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtmd_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtmd_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstart\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mstart\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    162\u001B[0m \u001B[43m                                 \u001B[49m\u001B[43mjmd_n_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_n_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mjmd_c_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_c_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mext_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mext_len\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m    163\u001B[0m df_feat[ut\u001B[38;5;241m.\u001B[39mCOL_POSITION] \u001B[38;5;241m=\u001B[39m feat_positions\n\u001B[1;32m    164\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m df_feat\n",
+      "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/cpp/feature.py:614\u001B[0m, in \u001B[0;36mSequenceFeature.add_position\u001B[0;34m(df_feat, features, start, tmd_len, jmd_n_len, jmd_c_len, ext_len, part_split)\u001B[0m\n\u001B[1;32m    612\u001B[0m     ut\u001B[38;5;241m.\u001B[39mcheck_non_negative_number(name\u001B[38;5;241m=\u001B[39mname, val\u001B[38;5;241m=\u001B[39margs[name], just_int\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m, min_val\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m0\u001B[39m)\n\u001B[1;32m    613\u001B[0m sfp \u001B[38;5;241m=\u001B[39m SequenceFeaturePositions()\n\u001B[0;32m--> 614\u001B[0m dict_part_pos \u001B[38;5;241m=\u001B[39m \u001B[43msfp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_dict_part_pos\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtmd_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtmd_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m    615\u001B[0m feat_positions \u001B[38;5;241m=\u001B[39m sfp\u001B[38;5;241m.\u001B[39mget_positions(dict_part_pos\u001B[38;5;241m=\u001B[39mdict_part_pos, features\u001B[38;5;241m=\u001B[39mfeatures)\n\u001B[1;32m    616\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m feat_positions\n",
+      "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/cpp/_feature_pos.py:52\u001B[0m, in \u001B[0;36mSequenceFeaturePositions.get_dict_part_pos\u001B[0;34m(tmd_len, jmd_n_len, jmd_c_len, ext_len, start)\u001B[0m\n\u001B[1;32m     50\u001B[0m tmd \u001B[38;5;241m=\u001B[39m [i \u001B[38;5;241m+\u001B[39m start \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m tmd]\n\u001B[1;32m     51\u001B[0m jmd_c \u001B[38;5;241m=\u001B[39m [i \u001B[38;5;241m+\u001B[39m start \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m jmd_c]\n\u001B[0;32m---> 52\u001B[0m dict_part_pos \u001B[38;5;241m=\u001B[39m \u001B[43mpa\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_dict_part_seq\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtmd_seq\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtmd\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mjmd_n_seq\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_n\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mjmd_c_seq\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_c\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mext_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mext_len\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m     53\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m dict_part_pos\n",
+      "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/cpp/_part.py:190\u001B[0m, in \u001B[0;36mParts.get_dict_part_seq\u001B[0;34m(df, entry, tmd_seq, jmd_n_seq, jmd_c_seq, ext_len)\u001B[0m\n\u001B[1;32m    188\u001B[0m     tmd_seq, jmd_n_seq, jmd_c_seq \u001B[38;5;241m=\u001B[39m _get_parts_from_df(df\u001B[38;5;241m=\u001B[39mdf, entry\u001B[38;5;241m=\u001B[39mentry)\n\u001B[1;32m    189\u001B[0m check_parts(tmd\u001B[38;5;241m=\u001B[39mtmd_seq, jmd_n\u001B[38;5;241m=\u001B[39mjmd_n_seq, jmd_c\u001B[38;5;241m=\u001B[39mjmd_c_seq)\n\u001B[0;32m--> 190\u001B[0m \u001B[43mut\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcheck_args_len\u001B[49m\u001B[43m(\u001B[49m\u001B[43mjmd_n_seq\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_n_seq\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mjmd_c_seq\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mjmd_c_seq\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mext_len\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mext_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43maccept_tmd_none\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m)\u001B[49m\n\u001B[1;32m    191\u001B[0m dict_part_seq \u001B[38;5;241m=\u001B[39m _get_dict_part_seq_from_seq(tmd\u001B[38;5;241m=\u001B[39mtmd_seq, jmd_n\u001B[38;5;241m=\u001B[39mjmd_n_seq, jmd_c\u001B[38;5;241m=\u001B[39mjmd_c_seq, ext_len\u001B[38;5;241m=\u001B[39mext_len)\n\u001B[1;32m    192\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m dict_part_seq\n",
+      "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/_utils/utils_cpp.py:119\u001B[0m, in \u001B[0;36mcheck_args_len\u001B[0;34m(tmd_len, jmd_n_len, jmd_c_len, ext_len, tmd_seq, jmd_n_seq, jmd_c_seq, accept_tmd_none)\u001B[0m\n\u001B[1;32m    117\u001B[0m \u001B[38;5;66;03m# Check if lengths and sequences match\u001B[39;00m\n\u001B[1;32m    118\u001B[0m tmd_len \u001B[38;5;241m=\u001B[39m _check_seq(tmd_seq, tmd_len, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtmd_seq\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtmd_len\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m--> 119\u001B[0m jmd_n_len \u001B[38;5;241m=\u001B[39m \u001B[43m_check_seq\u001B[49m\u001B[43m(\u001B[49m\u001B[43mjmd_n_seq\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mjmd_n_len\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mjmd_n_seq\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mjmd_n_len\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m    120\u001B[0m jmd_c_len \u001B[38;5;241m=\u001B[39m _check_seq(jmd_c_seq, jmd_c_len, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mjmd_c_seq\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mjmd_c_len\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m    121\u001B[0m \u001B[38;5;66;03m# Check if lengths are matching\u001B[39;00m\n",
+      "File \u001B[0;32m~/Programming/Pycharm_Projekte/1Packages/aaanalysis/aaanalysis/_utils/utils_cpp.py:100\u001B[0m, in \u001B[0;36m_check_seq\u001B[0;34m(seq, len_, name_seq, name_len)\u001B[0m\n\u001B[1;32m     98\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m     99\u001B[0m     \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(seq, \u001B[38;5;28mstr\u001B[39m):\n\u001B[0;32m--> 100\u001B[0m         \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname_seq\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m should be string (type=\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mtype\u001B[39m(seq)\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m)\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m    101\u001B[0m     \u001B[38;5;28;01mif\u001B[39;00m len_ \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m    102\u001B[0m         \u001B[38;5;66;03m# Waring sequence length doesn't match the corresponding length parameter\u001B[39;00m\n\u001B[1;32m    103\u001B[0m         \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(seq) \u001B[38;5;241m<\u001B[39m len_:\n",
+      "\u001B[0;31mValueError\u001B[0m: 'jmd_n_seq' should be string (type=<class 'list'>)"
      ]
     }
    ],
    "source": [
-    "# Feature Engineering\n",
-    "y = list(df[\"label\"])\n",
-    "sf = aa.SequenceFeature()\n",
-    "df_parts = sf.get_df_parts(df_seq=df, jmd_n_len=50, jmd_c_len=50)\n",
-    "args = dict(df_scales=df_scales, df_parts=df_parts, accept_gaps=True)\n",
     "# Small set of features (300 features created)\n",
-    "split_kws = sf.get_split_kws(n_split_max=1, split_types=[\"Segment\"])\n",
-    "cpp = aa.CPP(df_cat=df_cat, **args, split_kws=split_kws)\n",
-    "df_feat = cpp.run(labels=y, tmd_len=200, n_processes=8, n_filter=100)"
-   ]
+    "cpp = aa.CPP(df_parts=df_parts, df_cat=df_cat, df_scales=df_scales, split_kws=split_kws)\n",
+    "df_feat = cpp.run(labels=y, tmd_len=20, n_filter=100)  "
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-09-23T14:22:06.231449367Z",
+     "start_time": "2023-09-23T14:21:58.054329311Z"
+    }
+   }
   },
   {
    "cell_type": "markdown",
@@ -130,43 +207,74 @@
     }
    },
    "source": [
-    "# Machine learning\n",
-    "The SequenceFeature class provides as well a method to create a feature matrix from a given set of CPP features."
+    "## 3. Protein Prediction\n",
+    "A feature matrix from a given set of CPP features can be created using ``sf.feat_matrix``:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
+   "execution_count": 13,
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Mean accuracy of 0.55\n"
+     "ename": "NameError",
+     "evalue": "name 'df_feat' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[0;31mNameError\u001B[0m                                 Traceback (most recent call last)",
+      "Cell \u001B[0;32mIn[13], line 3\u001B[0m\n\u001B[1;32m      1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01msklearn\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mensemble\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m RandomForestClassifier\n\u001B[1;32m      2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01msklearn\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmodel_selection\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m cross_val_score\n\u001B[0;32m----> 3\u001B[0m X \u001B[38;5;241m=\u001B[39m sf\u001B[38;5;241m.\u001B[39mfeat_matrix(df_parts\u001B[38;5;241m=\u001B[39mdf_parts, df_scales\u001B[38;5;241m=\u001B[39mdf_scales, features\u001B[38;5;241m=\u001B[39m\u001B[43mdf_feat\u001B[49m[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mfeature\u001B[39m\u001B[38;5;124m\"\u001B[39m])\n",
+      "\u001B[0;31mNameError\u001B[0m: name 'df_feat' is not defined"
      ]
     }
    ],
    "source": [
-    "X = sf.feat_matrix(**args, features=df_feat[\"feature\"])\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "X = sf.feat_matrix(df_parts=df_parts, df_scales=df_scales, features=df_feat[\"feature\"])"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-09-23T14:22:13.569330952Z",
+     "start_time": "2023-09-23T14:22:13.527712901Z"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "This feature matrix can now be used for common machine learning models."
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
     "# ML evaluation\n",
     "rf = RandomForestClassifier()\n",
     "cv = cross_val_score(rf, X, y, scoring=\"accuracy\", cv=5, n_jobs=8) # Set n_jobs=1 to disable multi-processing\n",
     "print(f\"Mean accuracy of {round(np.mean(cv), 2)}\")"
-   ]
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Creating more initial features will take more time but improve prediction performance."
+   ],
+   "metadata": {
+    "collapsed": false
+   }
   },
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
    "outputs": [
     {
      "name": "stdout",
@@ -182,16 +290,32 @@
     }
    ],
    "source": [
-    "# Default set of features (around 100.000 features created)\n",
+    "# Default split settings for features (around 100,000 features created)\n",
     "split_kws = sf.get_split_kws()\n",
-    "cpp = aa.CPP(df_cat=df_cat, **args, split_kws=split_kws)\n",
+    "cpp = aa.CPP(df_cat=df_cat, df_parts=df_parts, df_scales=df_scales, split_kws=split_kws)\n",
     "df_feat = cpp.run(labels=y, tmd_len=200, n_processes=8, n_filter=100)\n",
-    "X = sf.feat_matrix(**args, features=df_feat[\"feature\"])\n",
+    "X = sf.feat_matrix(df_parts=df_parts, df_scales=df_scales, features=df_feat[\"feature\"])\n",
     "# ML evaluation\n",
     "rf = RandomForestClassifier()\n",
     "cv = cross_val_score(rf, X, y, scoring=\"accuracy\", cv=5, n_jobs=1)  # Set n_jobs=1 to disable multi-processing\n",
     "print(f\"Mean accuracy of {round(np.mean(cv), 2)}\")"
-   ]
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## 4. Explainable AI\n",
+    "\n",
+    "### Explainable AI on group level\n",
+    "\n",
+    "### Explainable AI on individual level"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
   }
  ],
  "metadata": {
@@ -215,4 +339,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
\ No newline at end of file
+}