Update Table generation, aa.load_dataset documentation (e2e)

breimanntools · Sep 20, 2023 · 58741bb · 58741bb
1 parent dbd0304
commit 58741bb
Show file tree

Hide file tree

Showing 184 changed files with 1,913 additions and 788 deletions.
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
@@ -215,7 +215,21 @@ Documentation Style
   `napoleon <https://sphinxcontrib-napoleon.readthedocs.io/en/latest/#>`_, and
   `sphinx-design <https://sphinx-design.readthedocs.io/en/rtd-theme/>`_ extensions.
 
-- **Further Details**: See `docs/source/conf.py` for more.
+- **Further Details**: See our `conf.py <https://github.com/breimanntools/aaanalysis/blob/master/docs/source/conf.py>`_
+  for more.
+
+Documentation Layers
+---------------------
+This project's documentation is organized across four distinct layers, each with a specific focus and level of detail:
+
+- **Docstrings**: Concise code description, with minimal usage examples and references to other layers (in 'See also').
+
+- **Usage Principles**: Bird's-eye view with background and key principles, reflected by selected code examples.
+
+- **Tutorial**: Close-up on public interface, as step-by-step guide on essential usage with medium detail.
+
+- **Tables**:  Close-up on data or other tabular overviews, with detailed explanation of columns and critical values.
+
 
 Building the Docs
 -----------------

diff --git a/aaanalysis/__pycache__/utils.cpython-39.pyc b/aaanalysis/__pycache__/utils.cpython-39.pyc
diff --git a/aaanalysis/_data/scale_classification.xlsx → aaanalysis/_data/scales_cat.xlsx b/aaanalysis/_data/scale_classification.xlsx → aaanalysis/_data/scales_cat.xlsx
diff --git a/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc b/aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc
diff --git a/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc b/aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc
diff --git a/aaanalysis/_utils/_utils_constants.py b/aaanalysis/_utils/_utils_constants.py
@@ -9,7 +9,7 @@
 STR_SCALES = "scales"   # Min-max normalized scales (from AAontology)
 STR_SCALES_RAW = "scales_raw"   # Raw scales (from AAontology)
 STR_SCALES_PC = "scales_pc"     # AAclust pc-based scales (pc: principal component)
-STR_SCALE_CAT = "scale_classification"  # AAontology
+STR_SCALE_CAT = "scales_cat"  # AAontology
 STR_TOP60 = "top60"    # AAclustTop60
 STR_TOP60_EVAL = "top60_eval"  # AAclustTop60 evaluation
 
@@ -23,7 +23,9 @@
 COLS_PARTS = ["jmd_n", "tmd", "jmd_c"]
 COL_TMD_START = "tmd_start"
 COL_TMD_STOP = "tmd_stop"
-COLS_SEQ_INFO = [COL_SEQ, COL_TMD_START, COL_TMD_STOP]  # TODO
+COLS_SEQ_KEY = [COL_ENTRY, COL_SEQ, COL_LABEL]
+COLS_SEQ_TMD_POS_KEY = [COL_SEQ, COL_TMD_START, COL_TMD_STOP]  # TODO adjust to COL_ENTRY
+COLS_SEQ_TMD_PART_KEY = [COL_ENTRY, COL_SEQ] + COLS_PARTS
 # df_part
 
 # df_scales

diff --git a/aaanalysis/_utils/utils_cpp.py b/aaanalysis/_utils/utils_cpp.py
@@ -91,45 +91,44 @@ def check_ylim(df=None, ylim=None, val_col=None, retrieve_plot=False, scaling_fa
 
 
 # Sequence check function
-def _check_seq(seq, len_, name_seq, name_len, verbose):
+def _check_seq(seq, len_, name_seq, name_len):
     """"""
     if seq is None:
         return len_
     else:
-        if type(seq) != str:
+        if not isinstance(seq, str):
             raise ValueError(f"'{name_seq}' should be string (type={type(seq)})")
         if len_ is not None:
             # Waring sequence length doesn't match the corresponding length parameter
-            if len(seq) != len_ and verbose:
-                warning_msg = f"The length of {seq} ({len(seq)}) does not match {name_len} ({len_})."
-                ut_o.print_red(f"Warning: {warning_msg}")
+            if len(seq) < len_:
+                raise ValueError(f"The length of {seq} ({len(seq)}) should be >= {name_len} ({len_}).")
         return len(seq)
 
 
 def check_args_len(tmd_len=None, jmd_n_len=None, jmd_c_len=None, ext_len=None,
-                   tmd_seq=None, jmd_n_seq=None, jmd_c_seq=None, verbose=False,
-                   accept_tmd_none=False):
+                   tmd_seq=None, jmd_n_seq=None, jmd_c_seq=None, accept_tmd_none=False):
     """Check length parameters and if they are matching with sequences if provided"""
     # Check lengths
-    tmd_seq_given = tmd_seq is not None or accept_tmd_none # If tmd_seq is given, tmd_len can be None
+    tmd_seq_given = tmd_seq is not None or accept_tmd_none  # If tmd_seq is given, tmd_len can be None
     ut_check.check_non_negative_number(name="tmd_len", val=tmd_len, accept_none=tmd_seq_given, min_val=1)
     ut_check.check_non_negative_number(name="jmd_n_len", val=jmd_n_len, accept_none=True, min_val=1)
     ut_check.check_non_negative_number(name="jmd_c_len", val=jmd_c_len, accept_none=True, min_val=1)
     ut_check.check_non_negative_number(name="ext_len", val=ext_len, accept_none=True)
+    # Check if lengths and sequences match
+    tmd_len = _check_seq(tmd_seq, tmd_len, "tmd_seq", "tmd_len")
+    jmd_n_len = _check_seq(jmd_n_seq, jmd_n_len, "jmd_n_seq", "jmd_n_len")
+    jmd_c_len = _check_seq(jmd_c_seq, jmd_c_len, "jmd_c_seq", "jmd_c_len")
     # Check if lengths are matching
     if ext_len is not None:
         if jmd_n_len is None:
             raise ValueError(f"'jmd_n_len' should not be None if 'ext_len' ({ext_len}) is given")
         if jmd_c_len is None:
             raise ValueError(f"'jmd_c_len' should not be None if 'ext_len' ({ext_len}) is given")
         if jmd_n_len is not None and ext_len > jmd_n_len:
-            raise ValueError(f"'ext_len' ({ext_len}) must be <= jmd_n_len ({jmd_n_len})")
+            raise ValueError(f"'ext_len' ({ext_len}) must be <= length of jmd_n ({jmd_n_len})")
         if jmd_c_len is not None and ext_len > jmd_c_len:
-            raise ValueError(f"'ext_len' ({ext_len}) must be <= jmd_c_len ({jmd_c_len})")
-    # Check if lengths and sequences match
-    tmd_len = _check_seq(tmd_seq, tmd_len, "tmd_seq", "tmd_len", verbose)
-    jmd_n_len = _check_seq(jmd_n_seq, jmd_n_len, "jmd_n_seq", "jmd_n_len", verbose)
-    jmd_c_len = _check_seq(jmd_c_seq, jmd_c_len, "jmd_c_seq", "jmd_c_len", verbose)
+            raise ValueError(f"'ext_len' ({ext_len}) must be <= length of jmd_c ({jmd_c_len})")
+
     args_len = dict(tmd_len=tmd_len, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len)
     return args_len
 

diff --git a/aaanalysis/cpp/__pycache__/_feature_pos.cpython-39.pyc b/aaanalysis/cpp/__pycache__/_feature_pos.cpython-39.pyc
diff --git a/aaanalysis/cpp/__pycache__/_part.cpython-39.pyc b/aaanalysis/cpp/__pycache__/_part.cpython-39.pyc
diff --git a/aaanalysis/cpp/__pycache__/cpp.cpython-39.pyc b/aaanalysis/cpp/__pycache__/cpp.cpython-39.pyc
diff --git a/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc b/aaanalysis/cpp/__pycache__/feature.cpython-39.pyc
diff --git a/aaanalysis/cpp/_feature_pos.py b/aaanalysis/cpp/_feature_pos.py
@@ -49,7 +49,7 @@ def get_dict_part_pos(tmd_len=20, jmd_n_len=10, jmd_c_len=10, ext_len=0, start=1
         jmd_n = [i + start for i in jmd_n]
         tmd = [i + start for i in tmd]
         jmd_c = [i + start for i in jmd_c]
-        dict_part_pos = pa.get_dict_part_seq(tmd=tmd, jmd_n=jmd_n, jmd_c=jmd_c, ext_len=ext_len)
+        dict_part_pos = pa.get_dict_part_seq(tmd_seq=tmd, jmd_n_seq=jmd_n, jmd_c_seq=jmd_c, ext_len=ext_len)
         return dict_part_pos
 
     @staticmethod

diff --git a/aaanalysis/cpp/_part.py b/aaanalysis/cpp/_part.py
@@ -18,16 +18,6 @@ def check_parts(tmd=None, jmd_n=None, jmd_c=None):
         raise ValueError("'tmd', 'jmd_n', and 'jmd_c' must be given (should not be None)")
 
 
-def check_ext_len(jmd_n=None, jmd_c=None, ext_len=None):
-    """Check if lengths are matching"""
-    if jmd_n is None or jmd_c is None:
-        raise ValueError(f"'jmd_n' ({jmd_n}) and 'jmd_c' ({jmd_c}) should be given (not None)")
-    if ext_len > len(jmd_n):
-        raise ValueError(f"'ext_len' ({ext_len}) must be <= length of jmd_n ({len(jmd_n)})")
-    if ext_len > len(jmd_c):
-        raise ValueError(f"'ext_len' ({ext_len}) must be <= length of jmd_c ({len(jmd_c)})")
-
-
 # Part helper functions
 def _retrieve_string_starting_at_end(seq, start=None, end=None):
     """Reverse_string_start_end"""
@@ -178,25 +168,25 @@ def create_parts(seq=None, tmd_start=None, tmd_stop=None, jmd_n_len=10, jmd_c_le
         return parts
 
     @staticmethod
-    def get_dict_part_seq(df=None, entry=None, tmd=None, jmd_n=None, jmd_c=None, ext_len=0):
+    def get_dict_part_seq(df=None, entry=None, tmd_seq=None, jmd_n_seq=None, jmd_c_seq=None, ext_len=None):
         """Get dictionary for part to sequence either (a) form df using entry or (b) from sequence.
 
         Parameters
         ----------
         df: df with sequence features
         entry: entry for which dict_part_seq should be created
-        tmd: sequence of TMD
-        jmd_n: sequence of JMD-N
-        jmd_c: sequence of JMD-C
+        tmd_seq: sequence of TMD
+        jmd_n_seq: sequence of JMD-N
+        jmd_c_seq: sequence of JMD-C
         ext_len: length of extending part (starting from C and N terminal part of TMD)
 
         Returns
         -------
         dict_part_seq: dictionary with parts to sequence of parts for given entry
         """
         if not (df is None or entry is None):
-            tmd, jmd_n, jmd_c = _get_parts_from_df(df=df, entry=entry)
-        check_parts(tmd=tmd, jmd_n=jmd_n, jmd_c=jmd_c)
-        check_ext_len(jmd_n=jmd_n, jmd_c=jmd_c, ext_len=ext_len)
-        dict_part_seq = _get_dict_part_seq_from_seq(tmd=tmd, jmd_n=jmd_n, jmd_c=jmd_c, ext_len=ext_len)
+            tmd_seq, jmd_n_seq, jmd_c_seq = _get_parts_from_df(df=df, entry=entry)
+        check_parts(tmd=tmd_seq, jmd_n=jmd_n_seq, jmd_c=jmd_c_seq)
+        ut.check_args_len(jmd_n_seq=jmd_n_seq, jmd_c_seq=jmd_c_seq, ext_len=ext_len, accept_tmd_none=True)
+        dict_part_seq = _get_dict_part_seq_from_seq(tmd=tmd_seq, jmd_n=jmd_n_seq, jmd_c=jmd_c_seq, ext_len=ext_len)
         return dict_part_seq
diff --git a/aaanalysis/cpp/cpp.py b/aaanalysis/cpp/cpp.py
@@ -15,8 +15,8 @@
 
 # Filtering functions
 def _filtering_info(df=None, df_scales=None, check_cat=True):
-    """Get datasets structures for filtering, two dictionaries with feature to scales category resp.
-    feature positions and one datasets frame with paired pearson correlations of all scales"""
+    """Get datasets structures for filtering, two dictionaries with feature to scale category resp.
+    feature positions and one data frame with pairwise Pearson correlations of all scales"""
     if check_cat:
         dict_c = dict(zip(df[ut.COL_FEATURE], df["category"]))
     else:

diff --git a/aaanalysis/cpp/feature.py b/aaanalysis/cpp/feature.py
@@ -196,7 +196,7 @@ class SequenceFeature:
 
     # Basic datastructures for features
     @staticmethod
-    def get_df_parts(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None, ext_len=4, all_parts=False):
+    def get_df_parts(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None, ext_len=None, all_parts=False):
         """Create DataFrane with sequence parts.
 
         Parameters
@@ -226,36 +226,29 @@ def get_df_parts(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None, e
 
         Examples
         --------
-        Get sequence parts based on parts columns in df_seq with with 'tmd_e', and 'tmd_jmd' as parts:
+        Get sequence parts from df_seq with 'tmd_e', and 'tmd_jmd' as parts and jmd length of 10:
 
         >>> import aaanalysis as aa
         >>> sf = aa.SequenceFeature()
-        >>> df_seq = aa.load_dataset(name='GSEC_SUB_SEQ')
-        >>> df_parts = sf.get_df_parts(df_seq=df_seq, list_parts=["tmd_e", "tmd_jmd"])
-
-        Get sequence parts based on sequence column in df_seq and jmd_n_len and jmd_c_len with default parts:
-
-        >>> import aaanalysis as aa
-        >>> sf = aa.SequenceFeature()
-        >>> df_seq = aa.load_dataset(name='GSEC_SUB_SEQ')
-        >>> df_parts = sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=10)
+        >>> df_seq = aa.load_dataset(name='DOM_GSE')
+        >>> df_parts = sf.get_df_parts(df_seq=df_seq, list_parts=["tmd_e", "tmd_jmd"], jmd_n_len=10, jmd_c_len=10)
         """
         ut.check_args_len(jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, ext_len=ext_len, accept_tmd_none=True)
         df_seq = ut.check_df_seq(df_seq=df_seq, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len)
         list_parts = ut.check_list_parts(list_parts=list_parts, all_parts=all_parts)
-        seq_info_in_df = set(ut.COLS_SEQ_INFO).issubset(set(df_seq))
+        seq_info_in_df = set(ut.COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq))
         pa = Parts()
         dict_parts = {}
         for i, row in df_seq.iterrows():
             entry = row[ut.COL_ENTRY]
             if jmd_c_len is not None and jmd_n_len is not None and seq_info_in_df:
-                seq, start, stop = row[ut.COLS_SEQ_INFO].values
+                seq, start, stop = row[ut.COLS_SEQ_TMD_POS_KEY].values
                 parts = pa.create_parts(seq=seq, tmd_start=start, tmd_stop=stop,
                                         jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len)
                 jmd_n, tmd, jmd_c = parts.jmd_n, parts.tmd, parts.jmd_c
             else:
                 jmd_n, tmd, jmd_c = row[ut.COLS_PARTS].values
-            dict_part_seq = pa.get_dict_part_seq(tmd=tmd, jmd_n=jmd_n, jmd_c=jmd_c, ext_len=ext_len)
+            dict_part_seq = pa.get_dict_part_seq(tmd_seq=tmd, jmd_n_seq=jmd_n, jmd_c_seq=jmd_c, ext_len=ext_len)
             dict_part_seq = {part: dict_part_seq[part] for part in list_parts}
             dict_parts[entry] = dict_part_seq
         df_parts = pd.DataFrame.from_dict(dict_parts, orient="index")

diff --git a/aaanalysis/data_loader/__pycache__/data_loader.cpython-39.pyc b/aaanalysis/data_loader/__pycache__/data_loader.cpython-39.pyc