Skip to content

Commit

Permalink
Major change of code base (started)
Browse files Browse the repository at this point in the history
  • Loading branch information
breimanntools committed Sep 13, 2023
1 parent 55012cb commit 957943b
Show file tree
Hide file tree
Showing 147 changed files with 3,297 additions and 2,739 deletions.
9 changes: 5 additions & 4 deletions aaanalysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from aaanalysis.data_loader import load_dataset, load_scales
from aaanalysis.aaclust import AAclust
from aaanalysis.cpp import CPP, SequenceFeature, SplitRange
from aaanalysis.cpp import CPP, CPPPlot, SequenceFeature, SplitRange
from aaanalysis.dpulearn import dPULearn
from aaanalysis.utils_plot import plot_settings, plot_set_legend, plot_gcfs
from aaanalysis.plotting import plot_settings, plot_set_legend, plot_gcfs, plot_get_cmap, plot_get_cdict

__all__ = ["load_dataset", "load_scales",
"AAclust",
"CPP", "SequenceFeature", "SplitRange", "dPULearn",
"plot_settings", "plot_set_legend", "plot_gcfs"]
"CPP", "CPPPlot", "SequenceFeature", "SplitRange",
"dPULearn",
"plot_settings", "plot_set_legend", "plot_gcfs", "plot_get_cmap", "plot_get_cdict"]
Binary file modified aaanalysis/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file removed aaanalysis/__pycache__/_utils.cpython-38.pyc
Binary file not shown.
Binary file removed aaanalysis/__pycache__/_utils.cpython-39.pyc
Binary file not shown.
Binary file added aaanalysis/__pycache__/utils.cpython-39.pyc
Binary file not shown.
Binary file removed aaanalysis/__pycache__/utils_plot.cpython-39.pyc
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
127 changes: 127 additions & 0 deletions aaanalysis/_data/benchmarks/DOM_GSEC.tsv

Large diffs are not rendered by default.

695 changes: 695 additions & 0 deletions aaanalysis/_data/benchmarks/DOM_GSEC_PU.tsv

Large diffs are not rendered by default.

Binary file added aaanalysis/_data/benchmarks/INFO_benchmarks.xlsx
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file added aaanalysis/_utils/__init__.py
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
80 changes: 41 additions & 39 deletions aaanalysis/_utils.py → aaanalysis/_utils/_utils_check.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,11 @@
#! /usr/bin/python3
"""
Config with folder structure
Utility check functions
"""
import os
import platform
from sklearn.utils import check_array, check_consistent_length
import pandas as pd


# Helper Function
def _folder_path(super_folder, folder_name):
    """Return ``folder_name`` joined onto ``super_folder``, terminated by the module-level ``SEP``."""
    # Append the separator first so the joined path always ends with it
    terminated_name = folder_name + SEP
    return os.path.join(super_folder, terminated_name)


# Folder
# Path separator: backslash on Windows, forward slash on every other platform
SEP = "\\" if platform.system() == "Windows" else "/"
# Directory that contains this module file
FOLDER_PROJECT = os.path.dirname(os.path.abspath(__file__))
# 'data' sub-folder of FOLDER_PROJECT (built via _folder_path, so it ends with SEP)
FOLDER_DATA = _folder_path(FOLDER_PROJECT, 'data')
URL_DATA = "https://github.com/breimanntools/aaanalysis/tree/master/aaanalysis/data/"

# Default data for protein analysis
STR_SCALES = "scales"  # Min-max normalized scales (from AAontology)
STR_SCALES_RAW = "scales_raw"  # Raw scales (from AAontology)
STR_SCALES_PC = "scales_pc"  # AAclust pc-based scales (pc: principal component)
STR_SCALE_CAT = "scale_classification"  # AAontology
STR_TOP60 = "top60"  # AAclustTop60
STR_TOP60_EVAL = "top60_eval"  # AAclustTop60 evaluation

# Column names
COL_SCALE_ID = "scale_id"
COL_SEQ = "sequence"
COL_CAT = "category"
COL_SUBCAT = "subcategory"


# General check functions
# Type checking functions
def check_non_negative_number(name=None, val=None, min_val=0, max_val=None, accept_none=False,
just_int=True):
"""Check if value of given name variable is non-negative integer"""
Expand Down Expand Up @@ -108,8 +78,7 @@ def check_tuple(name=None, val=None, n=None):
raise ValueError(error)


# Data check functions
# TODO update
# Array checking functions
def check_feat_matrix(X=None, names=None, labels=None):
"""Check if X and y match (y can be labels or names). Otherwise, transpose X or give error."""
# TODO type check
Expand Down Expand Up @@ -141,7 +110,40 @@ def check_feat_matrix(X=None, y=None):
"""


# Plotting & print functions
def print_red(input_str, **args):
    """Print ``input_str`` wrapped in the ANSI escape codes for bright red text.

    Any keyword arguments are forwarded unchanged to the built-in ``print``.
    """
    red_text = f"\033[91m{input_str}\033[0m"
    print(red_text, **args)
# df checking functions
def check_col_in_df(df=None, name_df=None, col=None, col_type=None, accept_nan=False, error_if_exists=False):
    """
    Validate the presence (or absence) of a column in a DataFrame and, optionally, its values.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to inspect.
    name_df : str
        Name of the DataFrame, used only in error messages.
    col : str
        Column name to check.
    col_type : type or list of types, optional
        Accepted type(s) for every value of the column. If None, no type check is performed.
    accept_nan : bool, default=False
        If True, missing values are tolerated (and ignored by the type check).
    error_if_exists : bool, default=False
        If True, the column is required to be absent; no value checks are performed in that case.

    Raises
    ------
    ValueError
        If the column exists although ``error_if_exists`` is True, if the column is missing
        otherwise, if values have a wrong type, or if missing values occur while not accepted.
    """
    col_exists = col in df.columns

    if error_if_exists:
        if col_exists:
            raise ValueError(f"Column '{col}' already exists in '{name_df}'")
        # The column is absent as required. Falling through to the existence check below
        # would make this mode fail unconditionally, and there are no values to validate.
        return

    # Check if the column exists in the DataFrame
    if not col_exists:
        raise ValueError(f"'{col}' must be a column in '{name_df}': {list(df.columns)}")

    # Check if the types match
    if col_type is not None:
        # Normalize to a list (the list form is also what the error message reports)
        if not isinstance(col_type, list):
            col_type = [col_type]
        valid_types = tuple(col_type)
        wrong_types = [x for x in df[col] if not isinstance(x, valid_types)]

        # Remove NaNs from the list of wrong types if they are accepted
        if accept_nan:
            wrong_types = [x for x in wrong_types if not pd.isna(x)]

        if len(wrong_types) > 0:
            raise ValueError(f"Values in '{col}' should be of type(s) {col_type}, "
                             f"but the following values do not match: {wrong_types}")

    # Check if NaNs are present when they are not accepted
    if not accept_nan:
        if df[col].isna().sum() > 0:
            raise ValueError(f"NaN values are not allowed in '{col}'.")




71 changes: 71 additions & 0 deletions aaanalysis/_utils/_utils_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
This is a script for constants (e.g., names or column names)
"""


# Default datasets for protein analysis
STR_SCALES = "scales"  # Min-max normalized scales (from AAontology)
STR_SCALES_RAW = "scales_raw"  # Raw scales (from AAontology)
STR_SCALES_PC = "scales_pc"  # AAclust pc-based scales (pc: principal component)
STR_SCALE_CAT = "scale_classification"  # AAontology
STR_TOP60 = "top60"  # AAclustTop60
STR_TOP60_EVAL = "top60_eval"  # AAclustTop60 evaluation

# Column name datasets (DOM_GSEC)
COL_ENTRY = "entry"  # ACC, protein entry, uniprot id
COL_NAME = "name"  # Entry name, Protein name, Uniprot Name
COL_LABEL = "label"
COL_SEQ = "sequence"
COLS_PARTS = ["jmd_n", "tmd", "jmd_c"]
COL_TMD_START = "tmd_start"
COL_TMD_STOP = "tmd_stop"
COLS_SEQ_INFO = [COL_SEQ, COL_TMD_START, COL_TMD_STOP]  # TODO

# Column names scales (scale_classification)
COL_SCALE_ID = "scale_id"
COL_CAT = "category"
COL_SUBCAT = "subcategory"
COL_SCALE_NAME = "scale_name"
COL_SCALE_DES = "scale_description"

# Column names cpp features
COL_FEATURE = "feature"
# COL_CAT, COL_SUBCAT, COL_SCALE_NAME, COL_SCALE_DES
COL_ABS_AUC = "abs_auc"
COL_ABS_MEAN_DIF = "abs_mean_dif"
COL_MEAN_DIF = "mean_dif"
COL_STD_TEST = "std_test"
COL_STD_REF = "std_ref"
COL_PVAL_MW = "p_val_mann_whitney"
COL_PVAL_FDR = "p_val_fdr_bh"
COL_POSITION = "positions"

COL_FEAT_IMPORTANCE = "feat_importance"
COL_FEAT_IMP_STD = "feat_importance_std"
# Misspelled name ("COO_" instead of "COL_"); kept as an alias so existing references keep working
COO_FEAT_IMP_STD = COL_FEAT_IMP_STD
COL_FEAT_IMPACT = "feat_impact"

# Standard colors (hex strings; the trailing comment gives the equivalent RGB triple)
COLOR_SHAP_POS = '#FF0D57'  # (255, 13, 87)
COLOR_SHAP_NEG = '#1E88E5'  # (30, 136, 229)
COLOR_FEAT_POS = '#9D2B39'  # (157, 43, 57) Mean difference
COLOR_FEAT_NEG = '#326599'  # (50, 101, 153) Mean difference
COLOR_FEAT_IMP = '#7F7F7F'  # (127, 127, 127) feature importance
COLOR_TMD = '#00FA9A'  # (0, 250, 154)
COLOR_JMD = '#0000FF'  # (0, 0, 255)

# Lookup of the standard colors above by short key
DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS,
              "SHAP_NEG": COLOR_SHAP_NEG,
              "FEAT_POS": COLOR_FEAT_POS,
              "FEAT_NEG": COLOR_FEAT_NEG,
              "FEAT_IMP": COLOR_FEAT_IMP,
              "TMD": COLOR_TMD,
              "JMD": COLOR_JMD}

# Color name per category label
# NOTE(review): keys look like the scale categories of the scale classification — confirm
DICT_COLOR_CAT = {"ASA/Volume": "tab:blue",
                  "Composition": "tab:orange",
                  "Conformation": "tab:green",
                  "Energy": "tab:red",
                  "Others": "tab:gray",
                  "Polarity": "gold",
                  "Shape": "tab:cyan",
                  "Structure-Activity": "tab:brown"}
32 changes: 32 additions & 0 deletions aaanalysis/_utils/_utils_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
This is a script for adjusting output (mainly for the Python console)
"""
import numpy as np


# I Helper Functions
# Plotting & print functions
def print_red(input_str, **args):
    """Write ``input_str`` to the console in red using ANSI escape sequences.

    Keyword arguments (e.g. ``end``) are handed on to the built-in ``print``.
    """
    ansi_red_message = f"\033[91m{input_str}\033[0m"
    print(ansi_red_message, **args)


# Progress bar
def print_start_progress():
    """Print an empty 25-slot progress bar labelled 0.00%, without a trailing newline."""
    empty_bar = 25 * " "
    start_line = f"\r |{empty_bar}| 0.00%"
    # end="" keeps the cursor on this line; the leading "\r" returns to its start
    print_red(start_line, end="")


def print_progress(i=0, n=0):
    """Print a 25-slot progress bar for step ``i`` out of ``n`` (no trailing newline).

    Parameters
    ----------
    i : int or float, default=0
        Number of completed steps.
    n : int or float, default=0
        Total number of steps. A non-positive total is shown as 0.00% instead of
        raising ``ZeroDivisionError`` (which the default ``n=0`` would otherwise do).

    Notes
    -----
    The percentage is rounded to two decimals and clamped to the range [0, 100];
    each ``#`` in the bar represents 4%.
    """
    # Guard against an empty or invalid total before dividing
    ratio = i / n if n > 0 else 0.0
    # Clamp so the bar never exceeds or undershoots its fixed width of 25 slots
    progress = min(max(np.round(ratio * 100, 2), 0), 100)
    n_filled = int(progress / 4)
    progress_bar = "#" * n_filled + " " * (25 - n_filled)
    print_red(f"\r |{progress_bar}| {progress:.2f}%", end="")


def print_finished_progress():
    """Print a completely filled 25-slot progress bar labelled 100.00%, followed by a newline."""
    full_bar = 25 * "#"
    final_line = f"\r |{full_bar}| 100.00%"
    print_red(final_line)

Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#! /usr/bin/python3
"""
Config with folder structure
Utility functions for AAclust module
"""
import inspect

Expand Down
Loading

0 comments on commit 957943b

Please sign in to comment.