Skip to content

Commit

Permalink
Update Table generation, aa.load_dataset documentation (e2e)
Browse files Browse the repository at this point in the history
  • Loading branch information
breimanntools committed Sep 20, 2023
1 parent dbd0304 commit 58741bb
Show file tree
Hide file tree
Showing 184 changed files with 1,913 additions and 788 deletions.
16 changes: 15 additions & 1 deletion CONTRIBUTING.rst
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,21 @@ Documentation Style
`napoleon <https://sphinxcontrib-napoleon.readthedocs.io/en/latest/#>`_, and
`sphinx-design <https://sphinx-design.readthedocs.io/en/rtd-theme/>`_ extensions.

- **Further Details**: See `docs/source/conf.py` for more.
- **Further Details**: See our `conf.py <https://github.com/breimanntools/aaanalysis/blob/master/docs/source/conf.py>`_
for more.

Documentation Layers
---------------------
This project's documentation is organized across four distinct layers, each with a specific focus and level of detail:

- **Docstrings**: Concise code description, with minimal usage examples and references to other layers (in 'See also').

- **Usage Principles**: Bird's-eye view with background and key principles, illustrated by selected code examples.

- **Tutorial**: Close-up on public interface, as step-by-step guide on essential usage with medium detail.

- **Tables**: Close-up on data or other tabular overviews, with detailed explanation of columns and critical values.


Building the Docs
-----------------
Expand Down
Binary file modified aaanalysis/__pycache__/utils.cpython-39.pyc
Binary file not shown.
File renamed without changes.
Binary file modified aaanalysis/_utils/__pycache__/_utils_constants.cpython-39.pyc
Binary file not shown.
Binary file modified aaanalysis/_utils/__pycache__/utils_cpp.cpython-39.pyc
Binary file not shown.
6 changes: 4 additions & 2 deletions aaanalysis/_utils/_utils_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
STR_SCALES = "scales" # Min-max normalized scales (from AAontology)
STR_SCALES_RAW = "scales_raw" # Raw scales (from AAontology)
STR_SCALES_PC = "scales_pc" # AAclust pc-based scales (pc: principal component)
STR_SCALE_CAT = "scale_classification" # AAontology
STR_SCALE_CAT = "scales_cat" # AAontology
STR_TOP60 = "top60" # AAclustTop60
STR_TOP60_EVAL = "top60_eval" # AAclustTop60 evaluation

Expand All @@ -23,7 +23,9 @@
COLS_PARTS = ["jmd_n", "tmd", "jmd_c"]
COL_TMD_START = "tmd_start"
COL_TMD_STOP = "tmd_stop"
COLS_SEQ_INFO = [COL_SEQ, COL_TMD_START, COL_TMD_STOP] # TODO
COLS_SEQ_KEY = [COL_ENTRY, COL_SEQ, COL_LABEL]
COLS_SEQ_TMD_POS_KEY = [COL_SEQ, COL_TMD_START, COL_TMD_STOP] # TODO adjust to COL_ENTRY
COLS_SEQ_TMD_PART_KEY = [COL_ENTRY, COL_SEQ] + COLS_PARTS
# df_part

# df_scales
Expand Down
27 changes: 13 additions & 14 deletions aaanalysis/_utils/utils_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,45 +91,44 @@ def check_ylim(df=None, ylim=None, val_col=None, retrieve_plot=False, scaling_fa


# Sequence check function
def _check_seq(seq, len_, name_seq, name_len, verbose):
def _check_seq(seq, len_, name_seq, name_len):
""""""
if seq is None:
return len_
else:
if type(seq) != str:
if not isinstance(seq, str):
raise ValueError(f"'{name_seq}' should be string (type={type(seq)})")
if len_ is not None:
# Waring sequence length doesn't match the corresponding length parameter
if len(seq) != len_ and verbose:
warning_msg = f"The length of {seq} ({len(seq)}) does not match {name_len} ({len_})."
ut_o.print_red(f"Warning: {warning_msg}")
if len(seq) < len_:
raise ValueError(f"The length of {seq} ({len(seq)}) should be >= {name_len} ({len_}).")
return len(seq)


def check_args_len(tmd_len=None, jmd_n_len=None, jmd_c_len=None, ext_len=None,
tmd_seq=None, jmd_n_seq=None, jmd_c_seq=None, verbose=False,
accept_tmd_none=False):
tmd_seq=None, jmd_n_seq=None, jmd_c_seq=None, accept_tmd_none=False):
"""Check length parameters and if they are matching with sequences if provided"""
# Check lengths
tmd_seq_given = tmd_seq is not None or accept_tmd_none # If tmd_seq is given, tmd_len can be None
tmd_seq_given = tmd_seq is not None or accept_tmd_none # If tmd_seq is given, tmd_len can be None
ut_check.check_non_negative_number(name="tmd_len", val=tmd_len, accept_none=tmd_seq_given, min_val=1)
ut_check.check_non_negative_number(name="jmd_n_len", val=jmd_n_len, accept_none=True, min_val=1)
ut_check.check_non_negative_number(name="jmd_c_len", val=jmd_c_len, accept_none=True, min_val=1)
ut_check.check_non_negative_number(name="ext_len", val=ext_len, accept_none=True)
# Check if lengths and sequences match
tmd_len = _check_seq(tmd_seq, tmd_len, "tmd_seq", "tmd_len")
jmd_n_len = _check_seq(jmd_n_seq, jmd_n_len, "jmd_n_seq", "jmd_n_len")
jmd_c_len = _check_seq(jmd_c_seq, jmd_c_len, "jmd_c_seq", "jmd_c_len")
# Check if lengths are matching
if ext_len is not None:
if jmd_n_len is None:
raise ValueError(f"'jmd_n_len' should not be None if 'ext_len' ({ext_len}) is given")
if jmd_c_len is None:
raise ValueError(f"'jmd_c_len' should not be None if 'ext_len' ({ext_len}) is given")
if jmd_n_len is not None and ext_len > jmd_n_len:
raise ValueError(f"'ext_len' ({ext_len}) must be <= jmd_n_len ({jmd_n_len})")
raise ValueError(f"'ext_len' ({ext_len}) must be <= length of jmd_n ({jmd_n_len})")
if jmd_c_len is not None and ext_len > jmd_c_len:
raise ValueError(f"'ext_len' ({ext_len}) must be <= jmd_c_len ({jmd_c_len})")
# Check if lengths and sequences match
tmd_len = _check_seq(tmd_seq, tmd_len, "tmd_seq", "tmd_len", verbose)
jmd_n_len = _check_seq(jmd_n_seq, jmd_n_len, "jmd_n_seq", "jmd_n_len", verbose)
jmd_c_len = _check_seq(jmd_c_seq, jmd_c_len, "jmd_c_seq", "jmd_c_len", verbose)
raise ValueError(f"'ext_len' ({ext_len}) must be <= length of jmd_c ({jmd_c_len})")

args_len = dict(tmd_len=tmd_len, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len)
return args_len

Expand Down
Binary file modified aaanalysis/cpp/__pycache__/_feature_pos.cpython-39.pyc
Binary file not shown.
Binary file modified aaanalysis/cpp/__pycache__/_part.cpython-39.pyc
Binary file not shown.
Binary file modified aaanalysis/cpp/__pycache__/cpp.cpython-39.pyc
Binary file not shown.
Binary file modified aaanalysis/cpp/__pycache__/feature.cpython-39.pyc
Binary file not shown.
2 changes: 1 addition & 1 deletion aaanalysis/cpp/_feature_pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def get_dict_part_pos(tmd_len=20, jmd_n_len=10, jmd_c_len=10, ext_len=0, start=1
jmd_n = [i + start for i in jmd_n]
tmd = [i + start for i in tmd]
jmd_c = [i + start for i in jmd_c]
dict_part_pos = pa.get_dict_part_seq(tmd=tmd, jmd_n=jmd_n, jmd_c=jmd_c, ext_len=ext_len)
dict_part_pos = pa.get_dict_part_seq(tmd_seq=tmd, jmd_n_seq=jmd_n, jmd_c_seq=jmd_c, ext_len=ext_len)
return dict_part_pos

@staticmethod
Expand Down
26 changes: 8 additions & 18 deletions aaanalysis/cpp/_part.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,6 @@ def check_parts(tmd=None, jmd_n=None, jmd_c=None):
raise ValueError("'tmd', 'jmd_n', and 'jmd_c' must be given (should not be None)")


def check_ext_len(jmd_n=None, jmd_c=None, ext_len=None):
"""Check if lengths are matching"""
if jmd_n is None or jmd_c is None:
raise ValueError(f"'jmd_n' ({jmd_n}) and 'jmd_c' ({jmd_c}) should be given (not None)")
if ext_len > len(jmd_n):
raise ValueError(f"'ext_len' ({ext_len}) must be <= length of jmd_n ({len(jmd_n)})")
if ext_len > len(jmd_c):
raise ValueError(f"'ext_len' ({ext_len}) must be <= length of jmd_c ({len(jmd_c)})")


# Part helper functions
def _retrieve_string_starting_at_end(seq, start=None, end=None):
"""Reverse_string_start_end"""
Expand Down Expand Up @@ -178,25 +168,25 @@ def create_parts(seq=None, tmd_start=None, tmd_stop=None, jmd_n_len=10, jmd_c_le
return parts

@staticmethod
def get_dict_part_seq(df=None, entry=None, tmd=None, jmd_n=None, jmd_c=None, ext_len=0):
def get_dict_part_seq(df=None, entry=None, tmd_seq=None, jmd_n_seq=None, jmd_c_seq=None, ext_len=None):
"""Get dictionary for part to sequence either (a) from df using entry or (b) from sequence.
Parameters
----------
df: df with sequence features
entry: entry for which dict_part_seq should be created
tmd: sequence of TMD
jmd_n: sequence of JMD-N
jmd_c: sequence of JMD-C
tmd_seq: sequence of TMD
jmd_n_seq: sequence of JMD-N
jmd_c_seq: sequence of JMD-C
ext_len: length of extending part (starting from C and N terminal part of TMD)
Returns
-------
dict_part_seq: dictionary with parts to sequence of parts for given entry
"""
if not (df is None or entry is None):
tmd, jmd_n, jmd_c = _get_parts_from_df(df=df, entry=entry)
check_parts(tmd=tmd, jmd_n=jmd_n, jmd_c=jmd_c)
check_ext_len(jmd_n=jmd_n, jmd_c=jmd_c, ext_len=ext_len)
dict_part_seq = _get_dict_part_seq_from_seq(tmd=tmd, jmd_n=jmd_n, jmd_c=jmd_c, ext_len=ext_len)
tmd_seq, jmd_n_seq, jmd_c_seq = _get_parts_from_df(df=df, entry=entry)
check_parts(tmd=tmd_seq, jmd_n=jmd_n_seq, jmd_c=jmd_c_seq)
ut.check_args_len(jmd_n_seq=jmd_n_seq, jmd_c_seq=jmd_c_seq, ext_len=ext_len, accept_tmd_none=True)
dict_part_seq = _get_dict_part_seq_from_seq(tmd=tmd_seq, jmd_n=jmd_n_seq, jmd_c=jmd_c_seq, ext_len=ext_len)
return dict_part_seq
4 changes: 2 additions & 2 deletions aaanalysis/cpp/cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

# Filtering functions
def _filtering_info(df=None, df_scales=None, check_cat=True):
"""Get datasets structures for filtering, two dictionaries with feature to scales category resp.
feature positions and one datasets frame with paired pearson correlations of all scales"""
"""Get data structures for filtering: two dictionaries mapping features to scale categories and to
feature positions, respectively, and one data frame with paired Pearson correlations of all scales"""
if check_cat:
dict_c = dict(zip(df[ut.COL_FEATURE], df["category"]))
else:
Expand Down
21 changes: 7 additions & 14 deletions aaanalysis/cpp/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ class SequenceFeature:

# Basic datastructures for features
@staticmethod
def get_df_parts(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None, ext_len=4, all_parts=False):
def get_df_parts(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None, ext_len=None, all_parts=False):
"""Create DataFrame with sequence parts.
Parameters
Expand Down Expand Up @@ -226,36 +226,29 @@ def get_df_parts(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None, e
Examples
--------
Get sequence parts based on parts columns in df_seq with with 'tmd_e', and 'tmd_jmd' as parts:
Get sequence parts from df_seq with 'tmd_e', and 'tmd_jmd' as parts and jmd length of 10:
>>> import aaanalysis as aa
>>> sf = aa.SequenceFeature()
>>> df_seq = aa.load_dataset(name='GSEC_SUB_SEQ')
>>> df_parts = sf.get_df_parts(df_seq=df_seq, list_parts=["tmd_e", "tmd_jmd"])
Get sequence parts based on sequence column in df_seq and jmd_n_len and jmd_c_len with default parts:
>>> import aaanalysis as aa
>>> sf = aa.SequenceFeature()
>>> df_seq = aa.load_dataset(name='GSEC_SUB_SEQ')
>>> df_parts = sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=10)
>>> df_seq = aa.load_dataset(name='DOM_GSE')
>>> df_parts = sf.get_df_parts(df_seq=df_seq, list_parts=["tmd_e", "tmd_jmd"], jmd_n_len=10, jmd_c_len=10)
"""
ut.check_args_len(jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, ext_len=ext_len, accept_tmd_none=True)
df_seq = ut.check_df_seq(df_seq=df_seq, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len)
list_parts = ut.check_list_parts(list_parts=list_parts, all_parts=all_parts)
seq_info_in_df = set(ut.COLS_SEQ_INFO).issubset(set(df_seq))
seq_info_in_df = set(ut.COLS_SEQ_TMD_POS_KEY).issubset(set(df_seq))
pa = Parts()
dict_parts = {}
for i, row in df_seq.iterrows():
entry = row[ut.COL_ENTRY]
if jmd_c_len is not None and jmd_n_len is not None and seq_info_in_df:
seq, start, stop = row[ut.COLS_SEQ_INFO].values
seq, start, stop = row[ut.COLS_SEQ_TMD_POS_KEY].values
parts = pa.create_parts(seq=seq, tmd_start=start, tmd_stop=stop,
jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len)
jmd_n, tmd, jmd_c = parts.jmd_n, parts.tmd, parts.jmd_c
else:
jmd_n, tmd, jmd_c = row[ut.COLS_PARTS].values
dict_part_seq = pa.get_dict_part_seq(tmd=tmd, jmd_n=jmd_n, jmd_c=jmd_c, ext_len=ext_len)
dict_part_seq = pa.get_dict_part_seq(tmd_seq=tmd, jmd_n_seq=jmd_n, jmd_c_seq=jmd_c, ext_len=ext_len)
dict_part_seq = {part: dict_part_seq[part] for part in list_parts}
dict_parts[entry] = dict_part_seq
df_parts = pd.DataFrame.from_dict(dict_parts, orient="index")
Expand Down
Binary file modified aaanalysis/data_loader/__pycache__/data_loader.cpython-39.pyc
Binary file not shown.
Loading

0 comments on commit 58741bb

Please sign in to comment.