diff --git a/aaanalysis/plotting/__init__.py b/aaanalysis/plotting/__init__.py new file mode 100644 index 00000000..dfa4cf8d --- /dev/null +++ b/aaanalysis/plotting/__init__.py @@ -0,0 +1,4 @@ +from aaanalysis.plotting.plotting_functions import plot_get_cmap, plot_get_cdict, plot_gcfs, \ + plot_settings, plot_set_legend + +__all__ = ["plot_get_cmap", "plot_get_cdict", "plot_settings", "plot_set_legend", "plot_gcfs"] diff --git a/aaanalysis/plotting/plotting_functions.py b/aaanalysis/plotting/plotting_functions.py new file mode 100644 index 00000000..e310225c --- /dev/null +++ b/aaanalysis/plotting/plotting_functions.py @@ -0,0 +1,434 @@ +#! /usr/bin/python3 +""" +Default plotting functions +""" +import seaborn as sns +import matplotlib as mpl +import matplotlib.pyplot as plt +import aaanalysis.utils as ut + + + +LIST_AA_COLOR_PALETTES = ["FEAT", "SHAP", "GGPLOT"] +LIST_AA_COLOR_DICTS = ["DICT_SCALE_CAT", "DICT_COLOR"] +LIST_AA_COLORS = LIST_AA_COLOR_PALETTES + LIST_AA_COLOR_DICTS + +LIST_FONTS = ['Arial', 'Avant Garde', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'DejaVu Sans', + 'Geneva', 'Helvetica', 'Lucid', 'Lucida Grande', 'Verdana'] + + +# Helper functions +def check_font_style(font="Arial"): + """""" + if font not in LIST_FONTS: + error_message = f"'font' ({font}) not in recommended fonts: {LIST_FONTS}. 
Set font manually by:" \ + f"\n\tplt.rcParams['font.sans-serif'] = '{font}'" + raise ValueError(error_message) + + +def check_fig_format(fig_format="pdf"): + """""" + list_fig_formats = ['eps', 'jpg', 'jpeg', 'pdf', 'pgf', 'png', 'ps', + 'raw', 'rgba', 'svg', 'svgz', 'tif', 'tiff', 'webp'] + ut.check_str(name="fig_format", val=fig_format) + if fig_format not in list_fig_formats: + raise ValueError(f"'fig_format' should be one of following: {list_fig_formats}") + + +def check_grid_axis(grid_axis="y"): + list_grid_axis = ["y", "x", "both"] + if grid_axis not in list_grid_axis: + raise ValueError(f"'grid_axis' ({grid_axis}) should be one of following: {list_grid_axis}") + + +def check_cats(list_cat=None, dict_color=None, labels=None): + """""" + ut.check_dict(name="dict_color", val=dict_color, accept_none=False) + if labels is not None: + if list_cat is not None: + if len(list_cat) != len(labels): + raise ValueError(f"Length of 'list_cat' ({len(list_cat)}) and 'labels' ({len(labels)}) must match") + elif len(dict_color) != len(labels): + raise ValueError(f"Length of 'dict_color' ({len(dict_color)}) and 'labels' ({len(labels)}) must match") + if list_cat is None: + list_cat = list(dict_color.keys()) + else: + raise ValueError("'list_cat' and 'dict_color' should not be None") + return list_cat + + +# Get color maps +def _get_shap_cmap(n_colors=100, facecolor_dark=True): + """Generate a diverging color map for feature values.""" + n = 20 + cmap_low = sns.light_palette(ut.COLOR_SHAP_NEG, input="hex", reverse=True, n_colors=int(n_colors/2)+n) + cmap_high = sns.light_palette(ut.COLOR_SHAP_POS, input="hex", n_colors=int(n_colors/2)+n) + c_middle = [(0, 0, 0)] if facecolor_dark else [cmap_low[-1]] + cmap = cmap_low[0:-n] + c_middle + cmap_high[n:] + return cmap + + +def _get_feat_cmap(n_colors=100, facecolor_dark=False): + """Generate a diverging color map for feature values.""" + n = 5 + cmap = sns.color_palette("RdBu_r", n_colors=n_colors + n * 2) + cmap_low, cmap_high = 
cmap[0:int((n_colors + n * 2) / 2)], cmap[int((n_colors + n * 2) / 2):] + c_middle = [(0, 0, 0)] if facecolor_dark else [cmap_low[-1]] + cmap = cmap_low[0:-n] + c_middle + cmap_high[n:] + return cmap + + +def _get_ggplot_cmap(n_colors=100): + """Generate a circular GGplot color palette.""" + cmap = sns.color_palette("husl", n_colors) + return cmap + + +def _get_default_colors(name=None, n_colors=100, facecolor_dark=True): + """Retrieve default color maps based on palette name.""" + args = dict(n_colors=n_colors, facecolor_dark=facecolor_dark) + if name == "SHAP": + return _get_shap_cmap(**args) + elif name == "FEAT": + return _get_feat_cmap(**args) + elif name == "GGPLOT": + return _get_ggplot_cmap(n_colors=n_colors) + + +def _get_cmap_with_gap(n_colors=100, color_pos=None, color_neg=None, color_center=None, pct_gap=10, pct_center=None, + input="hex"): + """Generate a custom color map with a gap.""" + n_gap = int(n_colors*pct_gap/2) + cmap_pos = sns.light_palette(color_pos, input=input, n_colors=int(n_colors/2)+n_gap) + cmap_neg = sns.light_palette(color_neg, input=input, reverse=True, n_colors=int(n_colors/2)+n_gap) + color_center = [cmap_neg[-1]] if color_center is None else color_center + color_center = [color_center] if type(color_center) is str else color_center + if pct_center is None: + cmap = cmap_neg[0:-n_gap] + color_center + cmap_pos[n_gap:] + else: + n_center = int(n_colors * pct_center) + n_gap += int(n_center/2) + cmap = cmap_neg[0:-n_gap] + color_center * n_center + cmap_pos[n_gap:] + return cmap + + +# Default plotting function +def plot_get_cmap(name=None, n_colors=100, facecolor_dark=False, + color_pos=None, color_neg=None, color_center=None, + input="hex", pct_gap=10, pct_center=None): + """ + Retrieve color maps or color dictionaries specified for AAanalysis. + + Parameters + ---------- + name : str, optional + The name of the color palette to use in AAanalysis. 
Options include: + - 'SHAP', 'FEAT', 'GGPLOT': Return color maps for SHAP plots, CPP feature maps/heatmaps, + and datagrouping as in GGplot, respectively. + - 'DICT_COLOR', 'DICT_SCALE_CAT': Return default color dictionaries for plots (e.g., bars in CPPPlot.profile) + and scale categories (e.g., CPPPlot.heatmap), respectively. + n_colors : int, default=100 + Number of colors in the color map. + facecolor_dark : bool, default=False + Whether to use a dark face color for 'SHAP' and 'FEAT'. + color_pos : str, optional + Hex code for the positive color. + color_neg : str, optional + Hex code for the negative color. + color_center : str or list, optional + Hex code or list for the center color. + input : str, {'rgb', 'hls', 'husl', 'xkcd'} + Color space to interpret the input color. The first three options + apply to tuple inputs and the latter applies to string inputs. + pct_gap : int, default=10 + Percentage size of the gap between color ranges. + pct_center : float, optional + Percentage size of the center color in the map. + + Returns + ------- + cmap : list or dict + If 'name' parameter is 'SHAP', 'FEAT', or 'GGPLOT', a list of colors specified for AAanalysis will be returned. + If 'name' parameter is None, a list of colors based on provided colors + + See Also + -------- + sns.color_palette : Function to generate a color palette in seaborn. + sns.light_palette : Function to generate a lighter color palette in seaborn. + """ + # TODO check color dict name + if name in LIST_AA_COLOR_PALETTES: + cmap = _get_default_colors(name=name, n_colors=n_colors, facecolor_dark=facecolor_dark) + return cmap + cmap = _get_cmap_with_gap(n_colors=n_colors, color_pos=color_pos, color_neg=color_neg, + color_center=color_center, pct_gap=pct_gap, pct_center=pct_center, + input=input) + return cmap + + +def plot_get_cdict(name=None): + """ + Retrieve color dictionaries specified for AAanalysis. 
+ + Parameters + ---------- + name : str, {'DICT_COLOR', 'DICT_SCALE_CAT'} + The name of default color dictionaries for plots (e.g., bars in CPPPlot.profile) + and scale categories (e.g., CPPPlot.heatmap), respectively. + + Returns + ------- + cmap : dict + Specific AAanalysis color dictionary. + """ + # TODO check color dict name + color_dict = ut.DICT_COLOR if name == "DICT_COLORS" else ut.DICT_COLOR_CAT + return color_dict + + +def plot_settings(fig_format="pdf", verbose=False, grid=False, grid_axis="y", + font_scale=0.7, font="Arial", + change_size=True, weight_bold=True, adjust_elements=True, + short_ticks=False, no_ticks=False, + no_ticks_y=False, short_ticks_y=False, no_ticks_x=False, short_ticks_x=False): + """ + Configure general settings for plot visualization with various customization options. + + Parameters + ---------- + fig_format : str, default='pdf' + Specifies the file format for saving the plot. + verbose : bool, default=False + If True, enables verbose output. + grid : bool, default=False + If True, makes the grid visible. + grid_axis : str, default='y' + Choose the axis ('y', 'x', 'both') to apply the grid to. + font_scale : float, default=0.7 + Sets the scale for font sizes in the plot. + font : str, default='Arial' + Name of sans-serif font (e.g., 'Arial', 'Verdana', 'Helvetica', 'DejaVu Sans') + change_size : bool, default=True + If True, adjusts the size of plot elements. + weight_bold : bool, default=True + If True, text elements appear in bold. + adjust_elements : bool, default=True + If True, makes additional visual and layout adjustments to the plot. + short_ticks : bool, default=False + If True, uses short tick marks. + no_ticks : bool, default=False + If True, removes all tick marks. + no_ticks_y : bool, default=False + If True, removes tick marks on the y-axis. + short_ticks_y : bool, default=False + If True, uses short tick marks on the y-axis. + no_ticks_x : bool, default=False + If True, removes tick marks on the x-axis. 
+ short_ticks_x : bool, default=False + If True, uses short tick marks on the x-axis. + + Notes + ----- + This function modifies the global settings of Matplotlib and Seaborn libraries. + + Examples + -------- + >>> import aaanalysis as aa + >>> aa.plot_settings(fig_format="pdf", font_scale=1.0, weight_bold=False) + """ + # Check input + check_fig_format(fig_format=fig_format) + check_font_style(font=font) + check_grid_axis(grid_axis=grid_axis) + args_bool = {"verbose": verbose, "grid": grid, "change_size": change_size, "weight_bold": weight_bold, + "adjust_elements": adjust_elements, + "short_ticks": short_ticks, "no_ticks": no_ticks, "no_ticks_y": no_ticks_y, + "short_ticks_y": short_ticks_y, "no_ticks_x": no_ticks_x, "short_ticks_x": short_ticks_x} + for key in args_bool: + ut.check_bool(name=key, val=args_bool[key]) + ut.check_non_negative_number(name="font_scale", val=font_scale, min_val=0, just_int=False) + + # Set embedded fonts in PDF + mpl.rcParams.update(mpl.rcParamsDefault) + mpl.rcParams["pdf.fonttype"] = 42 + mpl.rcParams["pdf.fonttype"] = 42 + if verbose: + print(plt.rcParams.keys) # Print all plot settings that can be modified in general + if not change_size: + plt.rcParams["font.family"] = "sans-serif" + plt.rcParams["font.sans-serif"] = font + mpl.rc('font', **{'family': font}) + return + sns.set_context("talk", font_scale=font_scale) # Font settings https://matplotlib.org/3.1.1/tutorials/text/text_props.html + plt.rcParams["font.family"] = "sans-serif" + plt.rcParams["font.sans-serif"] = font + if weight_bold: + plt.rcParams["axes.labelweight"] = "bold" + plt.rcParams["axes.titleweight"] = "bold" + else: + plt.rcParams["axes.linewidth"] = 1 + plt.rcParams["xtick.major.width"] = 0.8 + plt.rcParams["xtick.minor.width"] = 0.6 + plt.rcParams["ytick.major.width"] = 0.8 + plt.rcParams["ytick.minor.width"] = 0.6 + if short_ticks: + plt.rcParams["xtick.major.size"] = 3.5 + plt.rcParams["xtick.minor.size"] = 2 + plt.rcParams["ytick.major.size"] = 3.5 + 
plt.rcParams["ytick.minor.size"] = 2 + if short_ticks_x: + plt.rcParams["xtick.major.size"] = 3.5 + plt.rcParams["xtick.minor.size"] = 2 + if short_ticks_y: + plt.rcParams["ytick.major.size"] = 3.5 + plt.rcParams["ytick.minor.size"] = 2 + if no_ticks: + plt.rcParams["xtick.major.size"] = 0 + plt.rcParams["xtick.minor.size"] = 0 + plt.rcParams["ytick.major.size"] = 0 + plt.rcParams["ytick.minor.size"] = 0 + if no_ticks_x: + plt.rcParams["xtick.major.size"] = 0 + plt.rcParams["xtick.minor.size"] = 0 + if no_ticks_y: + plt.rcParams["ytick.major.size"] = 0 + plt.rcParams["ytick.minor.size"] = 0 + + plt.rcParams["axes.labelsize"] = 17 #13.5 + plt.rcParams["axes.titlesize"] = 16.5 #15 + if fig_format == "pdf": + mpl.rcParams['pdf.fonttype'] = 42 + elif "svg" in fig_format: + mpl.rcParams['svg.fonttype'] = 'none' + font = {'family': font, "weight": "bold"} if weight_bold else {"family": font} + mpl.rc('font', **font) + if adjust_elements: + # Error bars + plt.rcParams["errorbar.capsize"] = 10 # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.errorbar.html + # Grid + plt.rcParams["axes.grid.axis"] = grid_axis # 'y', 'x', 'both' + plt.rcParams["axes.grid"] = grid + # Legend + plt.rcParams["legend.frameon"] = False + plt.rcParams["legend.fontsize"] = "medium" #"x-small" + plt.rcParams["legend.loc"] = 'upper right' # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.legend.html + + +def plot_gcfs(): + """Get current font size, which is set by ut.plot_settings function""" + # Get the current plotting context + current_context = sns.plotting_context() + font_size = current_context['font.size'] + return font_size + + +def plot_set_legend(ax=None, handles=None, dict_color=None, list_cat=None, labels=None, y=-0.2, x=0.5, ncol=3, + fontsize=11, weight="normal", lw=0, edgecolor=None, return_handles=False, loc="upper left", + labelspacing=0.2, columnspacing=1, title=None, fontsize_legend=None, title_align_left=True, + fontsize_weight="normal", shape=None, 
**kwargs): + """ + Set a customizable legend for a plot. + + Parameters + ---------- + ax : matplotlib.axes.Axes, default=None + The axes to attach the legend to. + handles : list, default=None + Handles for legend items. + dict_color : dict, default=None + A dictionary mapping categories to colors. + list_cat : list, default=None + List of categories to include in the legend. + labels : list, default=None + Labels for legend items. + y : float, default=-0.2 + The y-coordinate for the legend's anchor point. + x : float, default=0.5 + The x-coordinate for the legend's anchor point. + ncol : int, default=3 + Number of columns in the legend. + fontsize : int, default=11 + Font size for the legend text. + weight : str, default='normal' + Weight of the font. + lw : float, default=0 + Line width for legend items. + edgecolor : color, default=None + Edge color for legend items. + return_handles : bool, default=False + Whether to return handles and labels. + loc : str, default='upper left' + Location for the legend. + labelspacing : float, default=0.2 + Vertical spacing between legend items. + columnspacing : int, default=1 + Horizontal spacing between legend columns. + title : str, default=None + Title for the legend. + fontsize_legend : int, default=None + Font size for the legend title. + title_align_left : bool, default=True + Whether to align the title to the left. + fontsize_weight : str, default='normal' + Font weight for the legend title. + shape : str, default=None + Marker shape for legend items. + **kwargs : dict + Additional arguments passed directly to ax.legend() for finer control. + + Returns + ------- + ax : matplotlib.axes.Axes + The axes with the legend applied. + + See Also + -------- + matplotlib.pyplot.legend : For additional details on how the 'loc' parameter can be customized. + matplotlib.lines.Line2D : For additional details on the different types of marker shapes ('shape' parameter). 
+ + Examples + -------- + >>> import aaanalysis as aa + >>> aa.plot_set_legend(ax=ax, dict_color={'Cat1': 'red', 'Cat2': 'blue'}, shape='o') + """ + # Check input + if ax is None: + ax = plt.gca() + list_cat = check_cats(list_cat=list_cat, dict_color=dict_color, labels=labels) + args_float = {"y": y, "x": x, "lw": lw, "labelspacing": labelspacing, + "columnspacing": columnspacing} + for key in args_float: + ut.check_float(name=key, val=args_float[key]) + ut.check_non_negative_number(name="ncol", val=ncol, min_val=1, just_int=True, accept_none=False) + ut.check_non_negative_number(name="ncol", val=ncol, min_val=0, just_int=False, accept_none=True) + ut.check_bool(name="return_handles", val=return_handles) + ut.check_bool(name="title_align_left", val=title_align_left) + # TODO check other args + # Prepare the legend handles + dict_leg = {cat: dict_color[cat] for cat in list_cat} + # Generate function for legend markers based on provided shape + if shape is None: + if edgecolor is None: + f = lambda l, c: mpl.patches.Patch(facecolor=l, label=c, lw=lw, edgecolor=l) + else: + f = lambda l, c: mpl.patches.Patch(facecolor=l, label=c, lw=lw, edgecolor=edgecolor) + else: + f = lambda l, c: plt.Line2D([0], [0], marker=shape, color='w', markerfacecolor=l, markersize=10, label=c) + # Create handles if not provided + handles = [f(l, c) for c, l in dict_leg.items()] if handles is None else handles + # Return handles and labels if required + if return_handles: + return handles, labels + # Prepare labels and args + if labels is None: + labels = list(dict_leg.keys()) + args = dict(prop={"weight": weight, "size": fontsize}, **kwargs) + if fontsize_legend is not None: + args["title_fontproperties"] = {"weight": fontsize_weight, "size": fontsize_legend} + # Create the legend + legend = ax.legend(handles=handles, labels=labels, bbox_to_anchor=(x, y), ncol=ncol, loc=loc, + labelspacing=labelspacing, columnspacing=columnspacing, borderpad=0, **args, title=title) + # Align the title if 
required + if title_align_left: + legend._legend_box.align = "left" + return ax diff --git a/docs/source/_index/tables.rst b/docs/source/_index/tables.rst new file mode 100644 index 00000000..9e4ceec9 --- /dev/null +++ b/docs/source/_index/tables.rst @@ -0,0 +1,246 @@ +.. + Developer Notes: + This is the index file for all tables of the AAanalysis documentation. Each table should be saved the /tables + directory. This file will serve as template for tables.rst, which is automatically created on the information + provided here and in the .csv tables from the /tables directory. Add a new table as .csv in the /tables directory, + in the overview table at the beginning of this document, and a new section with a short description of it in this + document. Each column and important data types (e.g., categories) should be described. Each table should contain a + 'Reference' column. + Ignore 'tables_template.rst: WARNING: document isn't included in any toctree' warning + +Tables +====================== + +.. contents:: + :local: + :depth: 1 + +Overview Table +-------------- +All tables from the AAanalysis documentation are given here in chronological order of the project history. + +.. _0_mapper: +.. list-table:: + :header-rows: 1 + :widths: 8 8 8 + + * - Table + - Description + - See also + * - 1_overview_benchmarks + - Protein benchmark datasets + - aa.load_dataset + * - 2_overview_scales + - Amino acid scale datasets + - aa.load_scales + + +Protein benchmark datasets +-------------------------- +Three types of benchmark datasets are provided: + +- Residue prediction (AA): Datasets used to predict residue (amino acid) specific properties. +- Domain prediction (DOM): Dataset used to predict domain specific properties. +- Sequence prediction (SEQ): Datasets used to predict sequence specific properties. 
+ +The classification of each dataset is indicated as first part of their name followed by an abbreviation for the +specific dataset (e.g., 'AA_LDR', 'DOM_GSEC', 'SEQ_AMYLO'). For some datasets, an additional version of it is provided +for positive-unlabeled (PU) learning containing only positive (1) and unlabeled (2) data samples, as indicated by +*dataset_name_PU* (e.g., 'DOM_GSEC_PU'). + +.. _1_overview_benchmarks: +.. list-table:: + :header-rows: 1 + :widths: 8 8 8 8 8 8 8 8 8 8 + + * - Level + - Dataset + - # Sequences + - # Amino acids + - # Positives + - # Negatives + - Predictor + - Description + - Reference + - Label + * - Amino acid + - AA_CASPASE3 + - 233 + - 185605 + - 705 + - 184900 + - PROSPERous + - Prediction of caspase-3 cleavage site + - :ref:`Song18 ` + - 1 (adjacent to cleavage site), 0 (not adjacent to cleavage site) + * - Amino acid + - AA_FURIN + - 71 + - 59003 + - 163 + - 58840 + - PROSPERous + - Prediction of furin cleavage site + - :ref:`Song18 ` + - 1 (adjacent to cleavage site), 0 (not adjacent to cleavage site) + * - Amino acid + - AA_LDR + - 342 + - 118248 + - 35469 + - 82779 + - IDP-Seq2Seq + - Prediction of long intrinsically disordered regions (LDR) + - :ref:`Tang20 ` + - 1 (disordered), 0 (ordered) + * - Amino acid + - AA_MMP2 + - 573 + - 312976 + - 2416 + - 310560 + - PROSPERous + - Prediction of Matrix metallopeptidase-2 (MMP2) cleavage site + - :ref:`Song18 ` + - 1 (adjacent to cleavage site), 0 (not adjacent to cleavage site) + * - Amino acid + - AA_RNABIND + - 221 + - 55001 + - 6492 + - 48509 + - GMKSVM-RU + - Prediction of RNA-binding protein residues (RBP60 dataset) + - :ref:`Yang21 ` + - 1 (binding), 0 (non-binding) + * - Amino acid + - AA_SA + - 233 + - 185605 + - 101082 + - 84523 + - PROSPERous + - Prediction of solvent accessibility (SA) of residue (AA_CASPASE3 data set) + - :ref:`Song18 ` + - 1 (exposed/accessible), 0 (buried/non-accessible) + * - Sequence + - SEQ_AMYLO + - 1414 + - 8484 + - 511 + - 903 + - ReRF-Pred + 
- Prediction of amyloidogenic regions
+     - :ref:`Teng21 `
+     - 1 (amyloidogenic), 0 (non-amyloidogenic)
+   * - Sequence
+     - SEQ_CAPSID
+     - 7935
+     - 3364680
+     - 3864
+     - 4071
+     - VIRALpro
+     - Prediction of capsid proteins
+     - :ref:`Galiez16 `
+     - 1 (capsid protein), 0 (non-capsid protein)
+   * - Sequence
+     - SEQ_DISULFIDE
+     - 2547
+     - 614470
+     - 897
+     - 1650
+     - Dipro
+     - Prediction of disulfide bridges in sequences
+     - :ref:`Cheng06 `
+     - 1 (sequence with SS bond), 0 (sequence without SS bond)
+   * - Sequence
+     - SEQ_LOCATION
+     - 1835
+     - 732398
+     - 1045
+     - 790
+     - nan
+     - Prediction of subcellular location of protein (cytoplasm vs plasma membrane)
+     - :ref:`Shen19 `
+     - 1 (protein in cytoplasm), 0 (protein in plasma membrane)
+   * - Sequence
+     - SEQ_SOLUBLE
+     - 17408
+     - 4432269
+     - 8704
+     - 8704
+     - SOLpro
+     - Prediction of soluble and insoluble proteins
+     - :ref:`Magnan09 `
+     - 1 (soluble), 0 (insoluble)
+   * - Sequence
+     - SEQ_TAIL
+     - 6668
+     - 2671690
+     - 2574
+     - 4094
+     - VIRALpro
+     - Prediction of tail proteins
+     - :ref:`Galiez16 `
+     - 1 (tail protein), 0 (non-tail protein)
+   * - Domain
+     - DOM_GSEC
+     - 126
+     - 92964
+     - 63
+     - 63
+     - nan
+     - Prediction of gamma-secretase substrates
+     - :ref:`Breimann23c `
+     - 1 (substrate), 0 (non-substrate)
+   * - Domain
+     - DOM_GSEC_PU
+     - 694
+     - 494524
+     - 63
+     - 0
+     - nan
+     - Prediction of gamma-secretase substrates (PU dataset)
+     - :ref:`Breimann23c `
+     - 1 (substrate), 2 (unknown substrate status)
+
+
+Amino acid scale datasets
+-------------------------
+Different amino acid scale datasets are provided
+
+.. _2_overview_scales:
+..
list-table:: + :header-rows: 1 + :widths: 8 8 8 8 + + * - Dataset + - Description + - # Scales + - Reference + * - scales + - Amino acid scales (min-max normalized) + - 586 + - :ref:`Breimann23b ` + * - scales_raw + - Amino acid scales (raw values) + - 586 + - :ref:`Kawashima08 ` + * - scales_classification + - Classification of scales (Aaontology) + - 586 + - :ref:`Breimann23b ` + * - scales_pc + - Principal component (PC) compressed scales + - 20 + - :ref:`Breimann23a ` + * - top60 + - Top 60 scale subsets + - 60 + - :ref:`Breimann23a ` + * - top60_eval + - Evaluation of top 60 scale subsets + - 60 + - :ref:`Breimann23a ` + + diff --git a/docs/source/_index/tables/0_mapper.xlsx b/docs/source/_index/tables/0_mapper.xlsx new file mode 100644 index 00000000..7ff00447 Binary files /dev/null and b/docs/source/_index/tables/0_mapper.xlsx differ diff --git a/docs/source/_index/tables/1_overview_benchmarks.xlsx b/docs/source/_index/tables/1_overview_benchmarks.xlsx new file mode 100644 index 00000000..232d82b8 Binary files /dev/null and b/docs/source/_index/tables/1_overview_benchmarks.xlsx differ diff --git a/docs/source/_index/tables/2_overview_scales.xlsx b/docs/source/_index/tables/2_overview_scales.xlsx new file mode 100644 index 00000000..4565face Binary files /dev/null and b/docs/source/_index/tables/2_overview_scales.xlsx differ diff --git a/docs/source/conf.py b/docs/source/conf.py index 347a8c15..1485d697 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -9,7 +9,7 @@ sys.path.append(os.path.abspath('.')) -#from create_tables_doc import generate_table_rst +from create_tables_doc import generate_table_rst # -- Path and Platform setup -------------------------------------------------- SEP = "\\" if platform.system() == "Windows" else "/" @@ -172,7 +172,7 @@ ] # Create table.rst -#generate_table_rst() +generate_table_rst() # -- Linkcode configuration --------------------------------------------------- _module_path = 
os.path.dirname(importlib.util.find_spec("aaanalysis").origin) # type: ignore diff --git a/docs/source/index.rst b/docs/source/index.rst index 2206616b..57d04826 100755 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -5,6 +5,7 @@ Welcome to the AAanalysis documentation ======================================= +.. include:: index/badges.rst .. include:: index/overview.rst Install @@ -24,12 +25,14 @@ Install :caption: OVERVIEW index/introduction.rst + index/usage_principles.rst index/CONTRIBUTING_COPY.rst .. toctree:: :maxdepth: 1 :caption: EXAMPLES + tutorials.rst .. toctree:: :maxdepth: 2 @@ -40,6 +43,7 @@ Install .. toctree:: :maxdepth: 1 + _index/tables.rst index/references.rst Indices and tables diff --git a/docs/source/index/tables_template.rst b/docs/source/index/tables_template.rst new file mode 100755 index 00000000..13007991 --- /dev/null +++ b/docs/source/index/tables_template.rst @@ -0,0 +1,44 @@ +.. + Developer Notes: + This is the index file for all tables of the AAanalysis documentation. Each table should be saved the /tables + directory. This file will serve as template for tables.rst, which is automatically created on the information + provided here and in the .csv tables from the /tables directory. Add a new table as .csv in the /tables directory, + in the overview table at the beginning of this document, and a new section with a short description of it in this + document. Each column and important data types (e.g., categories) should be described. Each table should contain a + 'Reference' column. + Ignore 'tables_template.rst: WARNING: document isn't included in any toctree' warning + +Tables +====================== + +.. contents:: + :local: + :depth: 1 + +Overview Table +-------------- +All tables from the AAanalysis documentation are given here in chronological order of the project history. + +.. 
_0_mapper: + +Protein benchmark datasets +-------------------------- +Three types of benchmark datasets are provided: + +- Residue prediction (AA): Datasets used to predict residue (amino acid) specific properties. +- Domain prediction (DOM): Dataset used to predict domain specific properties. +- Sequence prediction (SEQ): Datasets used to predict sequence specific properties. + +The classification of each dataset is indicated as first part of their name followed by an abbreviation for the +specific dataset (e.g., 'AA_LDR', 'DOM_GSEC', 'SEQ_AMYLO'). For some datasets, an additional version of it is provided +for positive-unlabeled (PU) learning containing only positive (1) and unlabeled (2) data samples, as indicated by +*dataset_name_PU* (e.g., 'DOM_GSEC_PU'). + +.. _1_overview_benchmarks: + +Amino acid scale datasets +------------------------- +Different amino acid scale datasets are provided + +.. _2_overview_scales: + diff --git a/docs/source/index/usage_principles.rst b/docs/source/index/usage_principles.rst new file mode 100755 index 00000000..c86f9432 --- /dev/null +++ b/docs/source/index/usage_principles.rst @@ -0,0 +1,22 @@ +.. Developer Notes: + This is the index file for usage principles. Files for each part are saved in the /usage_principles directory + and the overview the AAanalysis package is given as component diagram (internal dependencies) and context diagram + (external dependencies). Always give the concise code examples reflecting the usage examples. Instead of including + comprehensive tables here, add them in tables.rst and refer to them with a short explanation + +Usage Principles +================ +Import AAanalysis as: + +.. code-block:: python + + import aaanalysis as aa + +.. 
toctree::
+   :maxdepth: 1
+
+   usage_principles/data_flow_entry_points
+   usage_principles/aaontology
+   usage_principles/feature_identification
+   usage_principles/pu_learning
+   usage_principles/xai
diff --git a/docs/source/index/usage_principles/aaontology.rst b/docs/source/index/usage_principles/aaontology.rst
new file mode 100755
index 00000000..90620117
--- /dev/null
+++ b/docs/source/index/usage_principles/aaontology.rst
@@ -0,0 +1,5 @@
+AAontology: Classification of amino acid scales
+===============================================
+
+AAontology is a two-level classification of amino acid scales, introduced in.
+
diff --git a/docs/source/index/usage_principles/data_flow_entry_points.rst b/docs/source/index/usage_principles/data_flow_entry_points.rst
new file mode 100755
index 00000000..8e8af181
--- /dev/null
+++ b/docs/source/index/usage_principles/data_flow_entry_points.rst
@@ -0,0 +1,8 @@
+Data Flow and Entry Points
+==========================
+
+The AAanalysis toolkit uses different DataFrames starting from DataFrames containing amino acid scales information
+(df_scales, df_cat) or sequence information (df_seq), which can be modified to obtain specific sequence parts (df_parts).
+Amino acid scales and sequence parts together with split settings are the input for the CPP algorithm, creating
+various physicochemical features (df_feat) by comparing two sets of protein sequences.
+
diff --git a/docs/source/index/usage_principles/feature_identification.rst b/docs/source/index/usage_principles/feature_identification.rst
new file mode 100755
index 00000000..27b8acf1
--- /dev/null
+++ b/docs/source/index/usage_principles/feature_identification.rst
@@ -0,0 +1,7 @@
+Identifying Physicochemical Signatures using CPP
+================================================
+
+The central algorithm of the AAanalysis platform is Comparative Physicochemical Profiling (CPP), a novel sequence-based
+feature engineering algorithm, designed to enable interpretable protein prediction.
+
+
diff --git a/docs/source/index/usage_principles/pu_learning.rst b/docs/source/index/usage_principles/pu_learning.rst
new file mode 100755
index 00000000..5020f5a7
--- /dev/null
+++ b/docs/source/index/usage_principles/pu_learning.rst
@@ -0,0 +1,17 @@
+Learning from unbalanced and small data
+=======================================
+
+Unbalanced and small datasets are everywhere in life science ....
+
+In a standard binary classification setup, data with positive (1) and negative (0) labels are provided, which can be
+used for training by machine learning models. If only a few samples of the negative class exist, data augmentation
+techniques (e.g., SMOTE) can be used to extend the negative dataset by artificially generated sequences. Such approaches
+are very popular for deep learning-based image recognition, but not feasible for protein sequence prediction tasks
+because slight amino acid mutations (sequence alterations or perturbations) can already have dramatic biological effects.
+Alternatively, negative samples can be identified from unlabeled samples (2), which often exist in great quantities.
+These unlabeled samples should be biologically as similar as possible to the positive class, besides not containing
+the features distinguishing the positive from the negative class. For example, .
+
+What is PU learning?
+--------------------
+Positive Unlabeled (PU) learning is a subfield of machine learning ...
\ No newline at end of file
diff --git a/docs/source/index/usage_principles/xai.rst b/docs/source/index/usage_principles/xai.rst
new file mode 100755
index 00000000..8357963d
--- /dev/null
+++ b/docs/source/index/usage_principles/xai.rst
@@ -0,0 +1,8 @@
+Explainable AI at Sequence Level
+================================
+
+Unbalanced and small datasets are everywhere in life science ...
+
+What is explainable AI?
+-----------------------
+
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..78d9b423
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,47 @@
+"""
+A summary of different testing strategies is provided as general background:
+
+1. Testing pyramid: Unit test (unit) >> Integration test (integration) >> System/End-to-End test (e2e)
+    a) Unit test: Check small bit of code (e.g., function) in isolation
+    b) Integration/Regression test: Check a larger bit of code (e.g., several classes)
+        Integration with external components/Sequence regression of internal calls
+    c) System test: Check whole system in different environments
+
+2. Positive vs negative testing
+    a) Positive unit testing: Check if code runs with valid input
+    b) Negative testing: Check if code throws an error with invalid input
+
+3. Additional test strategies
+    a) Property-Based Testing: Validate assumptions (hypothesis) of code using automatically generated data
+        "Complementary to unit testing" (p. 224-230, The Pragmatic Programmer)
+    b) Functional test: Check single bit of functionality in a system (similar to regression test?)
+        Unit test vs. functional test (Code is doing things right vs. Code is doing right things)
+
+Notes
+-----
+Recommended testing commands:
+    a) General: pytest -v -p no:warnings --tb=no test_cpp.py {line, short}
+    b) Function: pytest -v -p no:warnings --tb=no test_cpp.py::TestCPP:test_add_stat
+    c) Doctest: pytest -v --doctest-modules -p no:warnings cpp_tools/feature.py
+    d) Last failed: pytest --lf
+
+Recommended testing pattern: GIVEN, WHEN, THEN
+
+Recommended testing tools for pytest (given page from Brian, 2017):
+    a) Fixtures in conftest file (p. 50)
+    b) Parametrized Fixtures (p. 64)
+    c) Testing doctest namespace (p. 89)
+
+Following other testing tools are used:
+    a) Coverage.py: Determine how much code is tested (via pytest --cov=cpp_tools) (p.
126, Brian, 2017) + b) tox: Testing multiple configuration + c) hypothesis: Testing tool for property-based testing + +References +---------- +Brian Okken, Python Testing with pytest, The Pragmatic Programmers (2017) +David Thomas & Andrew Hunt, The Pragmatic Programmer, 20th Anniversary Edition (2019) + pp. 224-231 +David R. Maclver, Zac Hatfield-Dodds, ..., Hypothesis: A new approach to property-based testing (2019) +Harry Percival & Bob Gergory, Architecture Patterns with Python (2020) +""" diff --git a/tests/_data/cpp_features.xlsx b/tests/_data/cpp_features.xlsx new file mode 100644 index 00000000..576c5a19 Binary files /dev/null and b/tests/_data/cpp_features.xlsx differ diff --git a/tests/_utils.py b/tests/_utils.py new file mode 100644 index 00000000..205b41f5 --- /dev/null +++ b/tests/_utils.py @@ -0,0 +1,35 @@ +""" +File for testing utility functions and constants. +""" + +import os +import platform +from pathlib import Path + + +# Helper Function +def _folder_path(super_folder, folder_name): + """Modification of separator (OS depending)""" + path = os.path.join(super_folder, folder_name + SEP) + return path + + +# Folder +SEP = "\\" if platform.system() == "Windows" else "/" +FOLDER_PROJECT = str(Path(__file__).parent.parent).replace('/', SEP) + SEP +FOLDER_PROJECT += "tests" + SEP +FOLDER_RESULTS = _folder_path(FOLDER_PROJECT, 'results') +FOLDER_DATA = _folder_path(FOLDER_PROJECT, '_data') +FILE_FEAT = "cpp_features.xlsx" + + +# General Columns and strings +COL_SCALE_ID = "scale_id" +COL_CAT = "category" +COL_SUBCAT = "subcategory" +COL_NAME = "scale_name" +COL_SCALE_DESCRIPTION = "scale_description" +COL_SUBCAT_DESCRIPTION = "subcategory_description" +COL_COUNT = "n_scales" +COL_PROPERTY = "property" +COLS_SCALE_INFOS = [COL_SCALE_ID, COL_CAT, COL_SUBCAT, COL_NAME, COL_SCALE_DESCRIPTION] diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..fc68cdd8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,209 @@ +""" +This 
file contains shared fixtures (preloaded data) that can be used by tests. +""" +import pandas as pd +import numpy as np +import pytest + +import aaanalysis as aa +from aaanalysis import SequenceFeature +import tests._utils as ut + + +# Valid functions +@pytest.fixture(scope="module") +def df_seq(): + return aa.load_dataset(name="DOM_GSEC_PU", n=10) + + +@pytest.fixture(scope="module") +def labels(df_seq): + labels = [1 if x == "SUBEXPERT" else 0 for x in df_seq["label"]] + return labels + + +@pytest.fixture(scope="module") +def df_cat(): + df_cat = aa.load_scales(name="scales_cat").head(100) + return df_cat + + +@pytest.fixture(scope="module") +def df_scales(): + df_scales = aa.load_scales() + return df_scales + + +@pytest.fixture(scope="module") +def df_parts(df_seq): + sf = SequenceFeature() + return sf.get_df_parts(df_seq=df_seq) + + +@pytest.fixture(scope="module") +def split_kws(): + sf = SequenceFeature() + return sf.get_split_kws() + + +@pytest.fixture(scope="function") +def df_feat(): + return pd.read_excel(ut.FOLDER_DATA + ut.FILE_FEAT) + + +@pytest.fixture(scope="module") +def df_feat_module_scope(): + return pd.read_excel(ut.FOLDER_DATA + ut.FILE_FEAT) + + +@pytest.fixture(scope="module") +def list_parts(): + list_parts = [["tmd_jmd"], ["tmd"], ["tmd_e"], ["tmd_e", "tmd_c_jmd_c", "jmd_n_tmd_n"], + ["tmd", "tmd_e", "tmd_c_jmd_c", "jmd_n_tmd_n"]] + return list_parts + + +@pytest.fixture(scope="module") +def list_splits(): + list_splits = ["Segment(5,7)", "Segment(1,1)", "Pattern(C,1,2)", "Pattern(N,1)", "Pattern(N,1,4,10)", + "PeriodicPattern(N,i+2/3,1)", "PeriodicPattern(N,i+4/2,5)", "PeriodicPattern(C,i+1/5,1)"] + return list_splits + + +# Wrong +@pytest.fixture(params=[pd.DataFrame(), 2, "s", dict]) +def wrong_df(request): + return request.param + + +# Corrupted input using parametrized fixtures +def _corrupted_list_parts(): + list_parts = [["tmd_md"], ["TMD"], ["tmd_E"], ["md_e", "tmd_c_jmd_n", "jmd_n_tmd_a"], + ["tmd", "tmd_e", "tmd_c_jmd_c", 
"jmd_c_tmd_n"]] + return list_parts + + +@pytest.fixture(params=_corrupted_list_parts()) +def corrupted_list_parts(request): + return request.param + + +def _corrupted_list_splits(): + list_splits = ["Segment(5,2)", "segment(1,1)", "Pttern(C,1,2)", "Pattern(A,1)", "Pattern(N,25,4,10)", + "PeriodicPattern(N,i2/3,1)", "PeriodicPattern(N,i+4/2)", "Periodicattern(C,i+1/5,1)"] + return list_splits + + +@pytest.fixture(params=_corrupted_list_splits()) +def corrupted_list_splits(request): + return request.param + + +def _corrupted_df_seq(): + df_seq = aa.load_dataset(name="DOM_GSEC_PU", n=10) + dfa = df_seq.drop(["sequence"], axis=1) + df1 = dfa.drop(["tmd"], axis=1) + df2 = dfa.copy() + df2.iloc[:1, df2.columns.get_loc("tmd")] = np.nan + df3 = df2.copy() + df3["tmd"] = 4 + df4 = dfa.copy() + df4["tmd"] = np.nan + dfb = df_seq.drop(["tmd"], axis=1) + df5 = dfb.copy() + df5["sequence"] = 4 + df6 = dfb.copy() + df6["sequence"] = np.nan + return [df1, df2, df3, df4, df5, df6] + + +@pytest.fixture(params=_corrupted_df_seq()) +def corrupted_df_seq(request): + return request.param + + +def _corrupted_df_scales(): + df_scales = aa.load_scales() + scales = list(df_scales) + df1 = df_scales.copy() + df1[scales[0]] = "a" + df2 = pd.concat([df_scales, df_scales], axis=0) + df3 = pd.concat([df_scales, df_scales], axis=1) + df4 = df_scales.copy() + df4[scales[1]] = [np.NaN] + [0.5] * 19 + df5 = df_scales.copy() + df5.reset_index(inplace=True) + df6 = df_scales.copy() + df6.index = ["A"] * 20 + return [df1, df2, df3, df4, df5, df6] + + +@pytest.fixture(params=_corrupted_df_scales()) +def corrupted_df_scales(request): + return request.param + + +def _corrupted_split_kws(): + sf = SequenceFeature() + split_kws = sf.get_split_kws() + kws1 = split_kws.copy() + kws1["test"] = 1 + kws2 = split_kws.copy() + kws2["segment"] = kws2["Segment"] + kws2.pop("Segment") + kws3 = split_kws.copy() + kws3["Pattern"]["steps"] = [-1, 3] + kws4 = split_kws.copy() + kws4["PeriodicPattern"]["steps"] = [0, 0, 
None] + kws5 = split_kws.copy() + kws5["Segment"]["n_split_min"] = 10 + kws5["Segment"]["n_split_max"] = 5 + return [kws1, kws2, kws3, kws4, kws5] + + +@pytest.fixture(params=_corrupted_split_kws()) +def corrupted_split_kws(request): + return request.param + + +def _corrupted_df_parts(): + df_seq = aa.load_dataset(name="DOM_GSEC") + sf = SequenceFeature() + df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True) + df1 = pd.concat([df_parts, df_parts], axis=0) + df2 = pd.concat([df_parts, df_parts], axis=1) + df3 = df_parts.copy() + df3["test"] = "AAAAAAAAAAAAAAAAAAAA" + df4 = df_parts.copy() + df4["tmd"] = "AAAAAAAAAAAAAAAAa" + df5 = df_parts.copy() + df5.columns = [x.upper() for x in list(df_parts)] + return [df1, df2, df3, df4, df5, df5] + + +@pytest.fixture(params=_corrupted_df_parts()) +def corrupted_df_parts(request): + return request.param + + +def _corrupted_labels(): + df_seq = aa.load_dataset(name="DOM_GSEC", n=10) + labels = df_seq["label"].to_list() + labels_a = [str(x) for x in labels] + labels_b = [x + 1 for x in labels] + labels_c = labels.copy() + labels_c[0] = np.NaN + labels_d = labels.copy() + labels_d[5] = "a" + labels_e = labels.copy() + labels_e.extend([0, 1, 0]) + labels_f = labels.copy() + labels_f.remove(1) + labels_g = [0] * len(labels) + labels_h = [1] * len(labels) + return [labels_a, labels_b, labels_c, labels_d, labels_e, labels_f, labels_g, labels_h] + + +@pytest.fixture(params=_corrupted_labels()) +def corrupted_labels(request): + return request.param diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 00000000..b4441f07 --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,6 @@ +# pytest.ini + +[pytest] +filterwarnings = + ignore::DeprecationWarning +minversion = 6.0 \ No newline at end of 
file diff --git a/tests/unit/.hypothesis/examples/0338a9f663ab7546/cd6bd1dcfebeffe9 b/tests/unit/.hypothesis/examples/0338a9f663ab7546/cd6bd1dcfebeffe9 new file mode 100644 index 00000000..4227ca4e Binary files /dev/null and b/tests/unit/.hypothesis/examples/0338a9f663ab7546/cd6bd1dcfebeffe9 differ diff --git a/tests/unit/.hypothesis/examples/0b9fbbc4b67e1594/bec021b4f368e306 b/tests/unit/.hypothesis/examples/0b9fbbc4b67e1594/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/0b9fbbc4b67e1594/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/0d49c621ea836a14/bec021b4f368e306 b/tests/unit/.hypothesis/examples/0d49c621ea836a14/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/0d49c621ea836a14/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/2051beb3e8b6fe40/7210af19145ec2a8 b/tests/unit/.hypothesis/examples/2051beb3e8b6fe40/7210af19145ec2a8 new file mode 100644 index 00000000..f66c9cf4 Binary files /dev/null and b/tests/unit/.hypothesis/examples/2051beb3e8b6fe40/7210af19145ec2a8 differ diff --git a/tests/unit/.hypothesis/examples/2051beb3e8b6fe40/bec021b4f368e306 b/tests/unit/.hypothesis/examples/2051beb3e8b6fe40/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/2051beb3e8b6fe40/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/31734d851f1a4e2b/bec021b4f368e306 b/tests/unit/.hypothesis/examples/31734d851f1a4e2b/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/31734d851f1a4e2b/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/3693a99b3ae6f7c9/ad0afad02fa57cb9 b/tests/unit/.hypothesis/examples/3693a99b3ae6f7c9/ad0afad02fa57cb9 new file mode 100644 index 00000000..f8fa5a23 --- /dev/null +++ 
b/tests/unit/.hypothesis/examples/3693a99b3ae6f7c9/ad0afad02fa57cb9 @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tests/unit/.hypothesis/examples/3693a99b3ae6f7c9/bec021b4f368e306 b/tests/unit/.hypothesis/examples/3693a99b3ae6f7c9/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/3693a99b3ae6f7c9/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/3927be5e3c3f64a6/bec021b4f368e306 b/tests/unit/.hypothesis/examples/3927be5e3c3f64a6/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/3927be5e3c3f64a6/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/3be25a19924ed0e2/cd6bd1dcfebeffe9 b/tests/unit/.hypothesis/examples/3be25a19924ed0e2/cd6bd1dcfebeffe9 new file mode 100644 index 00000000..4227ca4e Binary files /dev/null and b/tests/unit/.hypothesis/examples/3be25a19924ed0e2/cd6bd1dcfebeffe9 differ diff --git a/tests/unit/.hypothesis/examples/473a3d8204356f77/bec021b4f368e306 b/tests/unit/.hypothesis/examples/473a3d8204356f77/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/473a3d8204356f77/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/522e9e2445800a97/8a65d0e6b9d94717 b/tests/unit/.hypothesis/examples/522e9e2445800a97/8a65d0e6b9d94717 new file mode 100644 index 00000000..1de00ecd --- /dev/null +++ b/tests/unit/.hypothesis/examples/522e9e2445800a97/8a65d0e6b9d94717 @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tests/unit/.hypothesis/examples/522e9e2445800a97/bec021b4f368e306 b/tests/unit/.hypothesis/examples/522e9e2445800a97/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/522e9e2445800a97/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/5b65fce0c423c045/bec021b4f368e306 
b/tests/unit/.hypothesis/examples/5b65fce0c423c045/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/5b65fce0c423c045/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/5ff96874b65ba13a/cd6bd1dcfebeffe9 b/tests/unit/.hypothesis/examples/5ff96874b65ba13a/cd6bd1dcfebeffe9 new file mode 100644 index 00000000..4227ca4e Binary files /dev/null and b/tests/unit/.hypothesis/examples/5ff96874b65ba13a/cd6bd1dcfebeffe9 differ diff --git a/tests/unit/.hypothesis/examples/62540fab9eff6bc0/bec021b4f368e306 b/tests/unit/.hypothesis/examples/62540fab9eff6bc0/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/62540fab9eff6bc0/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/6bbd04bcbcf9b4b9/bec021b4f368e306 b/tests/unit/.hypothesis/examples/6bbd04bcbcf9b4b9/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/6bbd04bcbcf9b4b9/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/711334a0a54002bb/cd6bd1dcfebeffe9 b/tests/unit/.hypothesis/examples/711334a0a54002bb/cd6bd1dcfebeffe9 new file mode 100644 index 00000000..4227ca4e Binary files /dev/null and b/tests/unit/.hypothesis/examples/711334a0a54002bb/cd6bd1dcfebeffe9 differ diff --git a/tests/unit/.hypothesis/examples/7124a863f1536def/bec021b4f368e306 b/tests/unit/.hypothesis/examples/7124a863f1536def/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/7124a863f1536def/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/7ac0e969251cf182/bec021b4f368e306 b/tests/unit/.hypothesis/examples/7ac0e969251cf182/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/7ac0e969251cf182/bec021b4f368e306 differ diff --git 
a/tests/unit/.hypothesis/examples/81b963090f45ad74/bec021b4f368e306 b/tests/unit/.hypothesis/examples/81b963090f45ad74/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/81b963090f45ad74/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/8301a69e314dc668/0bd1deb1f5e4e394 b/tests/unit/.hypothesis/examples/8301a69e314dc668/0bd1deb1f5e4e394 new file mode 100644 index 00000000..8214d0ee --- /dev/null +++ b/tests/unit/.hypothesis/examples/8301a69e314dc668/0bd1deb1f5e4e394 @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tests/unit/.hypothesis/examples/8301a69e314dc668/b16ae799954fe41e b/tests/unit/.hypothesis/examples/8301a69e314dc668/b16ae799954fe41e new file mode 100644 index 00000000..147efaa6 --- /dev/null +++ b/tests/unit/.hypothesis/examples/8301a69e314dc668/b16ae799954fe41e @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tests/unit/.hypothesis/examples/8301a69e314dc668/e9d93fbaf0a1b2a4 b/tests/unit/.hypothesis/examples/8301a69e314dc668/e9d93fbaf0a1b2a4 new file mode 100644 index 00000000..c96ab3cc --- /dev/null +++ b/tests/unit/.hypothesis/examples/8301a69e314dc668/e9d93fbaf0a1b2a4 @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tests/unit/.hypothesis/examples/8822ba4b2c91a43c/cd6bd1dcfebeffe9 b/tests/unit/.hypothesis/examples/8822ba4b2c91a43c/cd6bd1dcfebeffe9 new file mode 100644 index 00000000..4227ca4e Binary files /dev/null and b/tests/unit/.hypothesis/examples/8822ba4b2c91a43c/cd6bd1dcfebeffe9 differ diff --git a/tests/unit/.hypothesis/examples/8c42ed866132948e/bec021b4f368e306 b/tests/unit/.hypothesis/examples/8c42ed866132948e/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/8c42ed866132948e/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/92f6d0b72a3b7ae6/bec021b4f368e306 b/tests/unit/.hypothesis/examples/92f6d0b72a3b7ae6/bec021b4f368e306 new file 
mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/92f6d0b72a3b7ae6/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/a56b7aa70695021d/bec021b4f368e306 b/tests/unit/.hypothesis/examples/a56b7aa70695021d/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/a56b7aa70695021d/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/a60ad3cb3c7faff3/bec021b4f368e306 b/tests/unit/.hypothesis/examples/a60ad3cb3c7faff3/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/a60ad3cb3c7faff3/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/a60ad3cb3c7faff3/dbadd266fff9f046 b/tests/unit/.hypothesis/examples/a60ad3cb3c7faff3/dbadd266fff9f046 new file mode 100644 index 00000000..9280c0d3 --- /dev/null +++ b/tests/unit/.hypothesis/examples/a60ad3cb3c7faff3/dbadd266fff9f046 @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tests/unit/.hypothesis/examples/b052e35ac20e0ec5/bec021b4f368e306 b/tests/unit/.hypothesis/examples/b052e35ac20e0ec5/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/b052e35ac20e0ec5/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/b49dc780abd56132/cd6bd1dcfebeffe9 b/tests/unit/.hypothesis/examples/b49dc780abd56132/cd6bd1dcfebeffe9 new file mode 100644 index 00000000..4227ca4e Binary files /dev/null and b/tests/unit/.hypothesis/examples/b49dc780abd56132/cd6bd1dcfebeffe9 differ diff --git a/tests/unit/.hypothesis/examples/c4e8db3eac978a66/7210af19145ec2a8 b/tests/unit/.hypothesis/examples/c4e8db3eac978a66/7210af19145ec2a8 new file mode 100644 index 00000000..f66c9cf4 Binary files /dev/null and b/tests/unit/.hypothesis/examples/c4e8db3eac978a66/7210af19145ec2a8 differ diff --git 
a/tests/unit/.hypothesis/examples/c4e8db3eac978a66/bec021b4f368e306 b/tests/unit/.hypothesis/examples/c4e8db3eac978a66/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/c4e8db3eac978a66/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/c862f96f1db540a5/bec021b4f368e306 b/tests/unit/.hypothesis/examples/c862f96f1db540a5/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/c862f96f1db540a5/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/ca3c5f0a6e532da0/ad0afad02fa57cb9 b/tests/unit/.hypothesis/examples/ca3c5f0a6e532da0/ad0afad02fa57cb9 new file mode 100644 index 00000000..f8fa5a23 --- /dev/null +++ b/tests/unit/.hypothesis/examples/ca3c5f0a6e532da0/ad0afad02fa57cb9 @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tests/unit/.hypothesis/examples/cd014e4eb63571d0/bec021b4f368e306 b/tests/unit/.hypothesis/examples/cd014e4eb63571d0/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/cd014e4eb63571d0/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/e7884a84aa0309aa/bec021b4f368e306 b/tests/unit/.hypothesis/examples/e7884a84aa0309aa/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/e7884a84aa0309aa/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/ee49e7dd0cc2ebc2/bec021b4f368e306 b/tests/unit/.hypothesis/examples/ee49e7dd0cc2ebc2/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/ee49e7dd0cc2ebc2/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/f29ca4420287aed1/bec021b4f368e306 b/tests/unit/.hypothesis/examples/f29ca4420287aed1/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files 
/dev/null and b/tests/unit/.hypothesis/examples/f29ca4420287aed1/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/examples/f534cb803dee1cc6/cd6bd1dcfebeffe9 b/tests/unit/.hypothesis/examples/f534cb803dee1cc6/cd6bd1dcfebeffe9 new file mode 100644 index 00000000..4227ca4e Binary files /dev/null and b/tests/unit/.hypothesis/examples/f534cb803dee1cc6/cd6bd1dcfebeffe9 differ diff --git a/tests/unit/.hypothesis/examples/f58941429ec6524d/bec021b4f368e306 b/tests/unit/.hypothesis/examples/f58941429ec6524d/bec021b4f368e306 new file mode 100644 index 00000000..f76dd238 Binary files /dev/null and b/tests/unit/.hypothesis/examples/f58941429ec6524d/bec021b4f368e306 differ diff --git a/tests/unit/.hypothesis/unicode_data/13.0.0/charmap.json.gz b/tests/unit/.hypothesis/unicode_data/13.0.0/charmap.json.gz new file mode 100644 index 00000000..3cb2f83e Binary files /dev/null and b/tests/unit/.hypothesis/unicode_data/13.0.0/charmap.json.gz differ diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/test_aaclust.py b/tests/unit/test_aaclust.py new file mode 100644 index 00000000..ab657cb7 --- /dev/null +++ b/tests/unit/test_aaclust.py @@ -0,0 +1,90 @@ +""" +This is a script for ... 
+""" +import time +import pandas as pd +import numpy as np +from sklearn.cluster import AgglomerativeClustering, KMeans + +import aaanalysis as aa +from aaanalysis.aaclust.aaclust import get_min_cor, estimate_lower_bound_n_clusters, \ + optimize_n_clusters, merge_clusters, AAclust + +import tests._utils as ut + +# Settings +pd.set_option('expand_frame_repr', False) # Single line print for pd.Dataframe + +# TODO change to proper test (CPP) + + +# I Helper Functions +def get_feat_matrix(df_cat=None, df_scales=None, unclassified_in=True, return_col=ut.COL_SCALE_ID, cat=None): + """""" + if cat is not None: + df_cat = df_cat[df_cat[ut.COL_CAT] == cat] + if unclassified_in: + scales = df_cat[return_col].to_list() + else: + mask = (~df_cat[ut.COL_SUBCAT].str.contains("Unclassified")) & (df_cat[ut.COL_CAT] != "Others") + df_cat = df_cat[mask] + scales = df_cat[ut.COL_SCALE_ID].to_list() + X = np.array(df_scales[scales]).T + labels = list(df_cat[return_col]) + return X, labels + + +def get_data(): + """""" + df_cat = aa.load_scales(name="scales_cat") + df_scales = aa.load_scales(name="scales") + X, scales = get_feat_matrix(df_cat=df_cat.copy(), + df_scales=df_scales.copy(), + unclassified_in=True) + return X + + +def get_model(): + """""" + model_kwargs=dict() + model = AgglomerativeClustering + return model, model_kwargs + + +# II Main Functions +def test_steps(): + """""" + X = get_data() + model, model_kwargs = get_model() + args = dict(X=X, model=model, model_kwargs=model_kwargs, min_th=0.3, on_center=False) + k = estimate_lower_bound_n_clusters(**args) + k = optimize_n_clusters(**args, n_clusters=k) + labels = model(n_clusters=k, **model_kwargs).fit(X).labels_.tolist() + print(len(set(labels))) + labels_ = merge_clusters(X, labels=labels, min_th=0.3, on_center=False) + print(len(set(labels_))) + print(get_min_cor(X, labels=labels_, on_center=False)) + + +def test_aaclust(): + """""" + X = get_data() + model, model_kwargs = get_model() + aac = AAclust(model=model, 
model_kwargs=model_kwargs) + args = dict(on_center=False, min_th=0.3, merge=True, merge_metric="euclidean") + aac.fit(X, **args) + + +# III Test/Caller Functions + + +# IV Main +def main(): + t0 = time.time() + test_steps() + t1 = time.time() + print("Time:", t1 - t0) + + +if __name__ == "__main__": + main() diff --git a/tests/unit/test_cpp.py b/tests/unit/test_cpp.py new file mode 100644 index 00000000..d0c71853 --- /dev/null +++ b/tests/unit/test_cpp.py @@ -0,0 +1,625 @@ +""" +This is a script for Unit tests of the CPP class +""" +import pandas as pd +import numpy as np +import pytest +from hypothesis import given, settings +import hypothesis.strategies as some +import matplotlib as mpl +import matplotlib.pyplot as plt + +import tests._utils as ut +import aaanalysis as aa + + +@pytest.fixture(params=["a", 3, dict(), list(), pd.DataFrame(), -0]) +def wrong_input_cpp(request): + return request.param + + +@pytest.fixture(params=["a", dict(), list(), pd.DataFrame()]) +def wrong_input(request): + return request.param + + +@pytest.fixture +def cpp(df_scales, df_cat, df_parts, split_kws): + return aa.CPP(df_scales=df_scales, df_cat=df_cat, df_parts=df_parts, split_kws=split_kws) + + +# I Unit Tests +class TestCPP: + """Test CPP class interface""" + + # Positive unit test + def test_cpp_call(self, df_scales, df_cat, df_parts, split_kws): + cpp = aa.CPP(df_scales=df_scales, df_cat=df_cat, + df_parts=df_parts, split_kws=split_kws) + assert isinstance(cpp, object) + cpp = aa.CPP(df_parts=df_parts, ) + assert isinstance(cpp, object) + + # Negative unit test + def test_missing_input(self, df_scales, df_cat, df_parts, split_kws): + with pytest.raises(ValueError): + aa.CPP() + + def test_wrong_df_scales(self, wrong_input_cpp, df_cat, df_parts, split_kws): + with pytest.raises(ValueError): + aa.CPP(df_scales=wrong_input_cpp, df_cat=df_cat, df_parts=df_parts, split_kws=split_kws) + + def test_wrong_df_cat(self, df_scales, wrong_input_cpp, df_parts, split_kws): + with 
pytest.raises(ValueError): + aa.CPP(df_scales=df_scales, df_cat=wrong_input_cpp, df_parts=df_parts, split_kws=split_kws) + + def test_wrong_df_parts(self, df_scales, df_cat, wrong_input_cpp, split_kws): + with pytest.raises(ValueError): + aa.CPP(df_scales=df_scales, df_cat=df_cat, df_parts=wrong_input_cpp, split_kws=split_kws) + + def test_wrong_split_kws(self, df_scales, df_cat, df_parts, wrong_input_cpp): + with pytest.raises(ValueError): + aa.CPP(df_scales=df_scales, df_cat=df_cat, df_parts=df_parts, split_kws=wrong_input_cpp) + + +class TestAddStat: + """Test adding statistics of features to DataFrame""" + + # Positive unit tests + def test_add_stat(self, cpp, df_feat, labels): + assert isinstance(cpp.add_stat(df_feat=df_feat, labels=labels, parametric=True), pd.DataFrame) + assert isinstance(cpp.add_stat(df_feat=df_feat, labels=labels, parametric=False), pd.DataFrame) + df_feat = df_feat[["feature"]] + assert isinstance(cpp.add_stat(df_feat=df_feat, labels=labels, parametric=True), pd.DataFrame) + assert isinstance(cpp.add_stat(df_feat=df_feat, labels=labels, parametric=False), pd.DataFrame) + + # Negative unit tests + def test_wrong_df_feat(self, cpp, labels, wrong_df): + with pytest.raises(ValueError): + cpp.add_stat(df_feat=wrong_df, labels=labels) + + def test_corrupted_labels(self, cpp, corrupted_labels, df_feat): + with pytest.raises(ValueError): + cpp.add_stat(df_feat=df_feat, labels=corrupted_labels) + + +class TestAddPositions: + """Test add_positions method""" + + # Positive unit tests + def test_add_positions(self, df_feat, cpp): + df_feat = cpp._add_positions(df_feat=df_feat, tmd_len=30) + assert isinstance(df_feat, pd.DataFrame) + assert "positions" in list(df_feat) + + # Property based testing + @given(tmd_len=some.integers(min_value=15, max_value=100), + jmd_n_len=some.integers(min_value=5, max_value=20), + jmd_c_len=some.integers(min_value=5, max_value=20), + ext_len=some.integers(min_value=1, max_value=4), + start=some.integers(min_value=0, 
max_value=50)) + @settings(max_examples=10, deadline=None) + def test_add_position_tmd_len(self, df_feat_module_scope, df_parts, tmd_len, jmd_n_len, jmd_c_len, ext_len, start): + cpp = aa.CPP(df_parts=df_parts) + df_feat = cpp._add_positions(df_feat=df_feat_module_scope, tmd_len=tmd_len, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, + ext_len=ext_len, start=start) + assert isinstance(df_feat, pd.DataFrame) + + # Negative unit tests + def test_wrong_tmd_len(self, df_feat, cpp, wrong_input): + with pytest.raises(ValueError): + cpp._add_positions(df_feat=df_feat, tmd_len=wrong_input) + + def test_wrong_jmd_len(self, df_feat, cpp, wrong_input): + with pytest.raises(ValueError): + cpp._add_positions(df_feat=df_feat, jmd_n_len=wrong_input) + with pytest.raises(ValueError): + cpp._add_positions(df_feat=df_feat, jmd_c_len=wrong_input) + + def test_wrong_ext_len(self, df_feat, cpp, wrong_input): + with pytest.raises(ValueError): + cpp._add_positions(df_feat=df_feat, ext_len=wrong_input) + # ext_len >= jmd_n_len or jmd_c_len + with pytest.raises(ValueError): + cpp._add_positions(df_feat=df_feat, ext_len=5, jmd_n_len=3) + with pytest.raises(ValueError): + cpp._add_positions(df_feat=df_feat, ext_len=5, jmd_c_len=3) + + def test_wrong_start(self, df_feat, cpp, wrong_input): + with pytest.raises(ValueError): + cpp._add_positions(df_feat=df_feat, start=wrong_input) + with pytest.raises(ValueError): + cpp._add_positions(df_feat=df_feat, start=-4) + + +class TestAddScaleCategory: + """Test add_positions method""" + + # Positive unit tests + def test_add_scale_category(self, df_feat, cpp): + assert df_feat.equals(cpp.add_scale_info(df_feat=df_feat)) + df_no_cat = df_feat.drop([ut.COL_CAT, ut.COL_SUBCAT], axis=1) + df_with_cat = cpp.add_scale_info(df_feat=df_no_cat) + assert df_feat.equals(df_with_cat) + + # Negative unit tests + def test_wrong_input(self, cpp, wrong_input): + with pytest.raises(ValueError): + cpp.add_scale_info(df_feat=wrong_input) + + def test_missing_feature(self, 
# NOTE(review): a fragment of a negative add_scale_info test (its `def` line lies before
# this chunk) was cut at the chunk boundary and is not reproduced here — restore from VCS.


class TestAddFeatureImpact:
    """Test adding feature impact (SHAP values) to feature DataFrame."""

    # Positive unit tests
    def test_add_feat_impact(self, cpp, df_feat, df_parts, df_scales, labels):
        from sklearn.ensemble import RandomForestClassifier
        import shap
        sf = aa.SequenceFeature()
        X = sf.feat_matrix(features=list(df_feat["feature"]), df_parts=df_parts, df_scales=df_scales)
        assert isinstance(X, np.ndarray)
        model = RandomForestClassifier().fit(X=X, y=labels)
        # Compute SHAP values for the positive class and attach them to the feature table
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X, y=labels)
        df_feat["shap_value"] = shap_values[1][0]
        df_feat = cpp.add_shap(df_feat=df_feat)
        assert isinstance(df_feat, pd.DataFrame)
        df_feat = cpp.add_shap(df_feat=df_feat, name_feat_impact="Test")
        assert isinstance(df_feat, pd.DataFrame)
        # A single missing SHAP value should still be accepted
        df_feat["shap_value"] = [0.4] * (len(df_feat) - 1) + [np.nan]
        df_feat = cpp.add_shap(df_feat=df_feat, name_feat_impact="test")
        assert isinstance(df_feat, pd.DataFrame)

    # Negative unit tests
    def test_wrong_shap_value(self, cpp, df_feat):
        # Missing 'shap_value' column
        with pytest.raises(ValueError):
            cpp.add_shap(df_feat=df_feat)
        # Non-numerical 'shap_value' column
        df_feat["shap_value"] = "wrong"
        with pytest.raises(ValueError):
            cpp.add_shap(df_feat=df_feat)


class TestAddSampleDif:
    """Test adding differences of sample and reference mean to feature DataFrame."""

    # Positive unit tests
    def test_add_sample(self, df_feat, df_seq, labels, cpp):
        list_names = list(df_seq[ut.COL_NAME])[0:2]
        ref_group = 0
        # Test all names
        for prot_name in list_names:
            df_feat = cpp.add_sample_dif(df_feat=df_feat, df_seq=df_seq, labels=labels,
                                         sample_name=prot_name, ref_group=ref_group)
            assert isinstance(df_feat, pd.DataFrame)

    # Negative unit tests
    def test_wrong_input(self, df_feat, df_seq, labels, cpp):
        args = dict(df_feat=df_feat, df_seq=df_seq, labels=labels)
        name = "A4_HUMAN"
        ref_group = 0
        with pytest.raises(ValueError):
            cpp.add_sample_dif(**args, sample_name=name.lower(), ref_group=ref_group)
        with pytest.raises(ValueError):
            cpp.add_sample_dif(**args, sample_name=1, ref_group=ref_group)
        with pytest.raises(ValueError):
            cpp.add_sample_dif(**args, sample_name=name, ref_group=5)
        with pytest.raises(ValueError):
            cpp.add_sample_dif(**args, sample_name=name, ref_group=[0, 1])

    def test_corrupted_df_seq(self, df_feat, wrong_df, labels, cpp):
        name = "A4_HUMAN"
        ref_group = 0
        with pytest.raises(ValueError):
            cpp.add_sample_dif(df_feat=df_feat, df_seq=wrong_df,
                               labels=labels, sample_name=name, ref_group=ref_group)

    def test_corrupted_labels(self, df_feat, df_seq, corrupted_labels, cpp):
        name = "A4_HUMAN"
        ref_group = 0
        with pytest.raises(ValueError):
            cpp.add_sample_dif(df_feat=df_feat, df_seq=df_seq,
                               labels=corrupted_labels, sample_name=name, ref_group=ref_group)


class TestRun:
    """Test CPP.run method."""  # Fixed: docstring previously said 'add_positions'

    # Positive unit tests
    def test_cpp_run(self):
        sf = aa.SequenceFeature()
        df_seq = sf.load_sequences(n_in_class=2)
        labels = [1 if x == "SUBEXPERT" else 0 for x in df_seq["class"]]
        df_parts = sf.get_df_parts(df_seq=df_seq)
        df_cat = sf.load_categories()
        df_scales = sf.load_scales()
        # Restrict to two scales to keep the run fast
        list_scales = list(df_scales)[0:2]
        df_scales = df_scales[list_scales]
        cpp = aa.CPP(df_parts=df_parts, df_scales=df_scales, df_cat=df_cat)
        args = dict(verbose=False, labels=labels)
        assert isinstance(cpp.run(**args), pd.DataFrame)
        # Disabled (slow) parameter variations, kept for reference:
        """
        assert isinstance(cpp.run(parametric=True, **args), pd.DataFrame)
        assert isinstance(cpp.run(n_filter=1000, **args), pd.DataFrame)
        assert isinstance(cpp.run(n_pre_filter=1000, **args), pd.DataFrame)
        assert isinstance(cpp.run(accept_gaps=True, **args), pd.DataFrame)
        assert isinstance(cpp.run(pct_pre_filter=20, **args), pd.DataFrame)
        """

    # Negative unit tests
    def test_corrupted_labels(self, cpp, corrupted_labels):
        with pytest.raises(ValueError):
            cpp.run(verbose=False, labels=corrupted_labels)

    def test_wrong_n_filter(self, cpp, labels):
        for n in ["a", -3, list(), np.nan]:
            with pytest.raises(ValueError):
                cpp.run(verbose=False, labels=labels, n_filter=n)
        # Should be non-negative int > 1 and not None
        for n in [-1, 0, -100, 0.5, None]:
            with pytest.raises(ValueError):
                cpp.run(verbose=False, labels=labels, n_filter=n)

    def test_wrong_n_pre_filter(self, cpp, labels):
        for n in ["a", -3, list(), np.nan]:
            with pytest.raises(ValueError):
                cpp.run(verbose=False, labels=labels, n_pre_filter=n)
        # Should be non-negative int > 1 (None accepted)
        for n in [-1, 0, -100, 0.5]:
            with pytest.raises(ValueError):
                cpp.run(verbose=False, labels=labels, n_pre_filter=n)

    def test_wrong_pct_pre_filter(self, cpp, labels):
        for n in ["a", -3, list(), np.nan]:
            with pytest.raises(ValueError):
                cpp.run(verbose=False, labels=labels, pct_pre_filter=n)
        # Should be non-negative int >= 5 and not None
        for n in [-1, 0, -100, 0.5, 4, 3, None]:
            with pytest.raises(ValueError):
                cpp.run(verbose=False, labels=labels, pct_pre_filter=n)

    def test_wrong_max_std(self, cpp, labels):
        for n in ["a", -3, list(), np.nan]:
            with pytest.raises(ValueError):
                cpp.run(verbose=False, labels=labels, max_std_test=n)
        # Should be within the accepted range and not None
        for n in [-1, 100, -100, 4, 3, None]:
            with pytest.raises(ValueError):
                cpp.run(verbose=False, labels=labels, max_std_test=n)


# Disabled test class kept as an inert module-level string (tests for the private
# _get_df_pos helper; re-enable once the helper is importable here).
"""
class TestGetDfPos:
    # Positive unit tests
    def test_get_df_pos(self, df_feat, df_cat):
        df_pos = _get_df_pos(df_feat=df_feat, df_cat=df_cat)
        assert isinstance(df_pos, pd.DataFrame)
        for i in ["count", "mean", "sum", "std"]:
            assert isinstance(_get_df_pos(df_feat=df_feat, df_cat=df_cat, value_type=i), pd.DataFrame)

    # Property based testing
    @given(tmd_len=some.integers(min_value=15, max_value=100),
           jmd_n_len=some.integers(min_value=5, max_value=20),
           jmd_c_len=some.integers(min_value=5, max_value=20),
           start=some.integers(min_value=0, max_value=50))
    @settings(max_examples=10, deadline=None)
    def test_get_df_pos_len(self, df_feat_module_scope, df_cat, tmd_len, jmd_n_len, jmd_c_len, start):
        df_pos = _get_df_pos(df_feat=df_feat_module_scope, df_cat=df_cat,
                             tmd_len=tmd_len, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len,
                             start=start)
        assert isinstance(df_pos, pd.DataFrame)

    # Negative unit tests
    def test_wrong_value_type(self, df_feat, df_cat):
        with pytest.raises(ValueError):
            _get_df_pos(df_feat=df_feat, df_cat=df_cat, tmd_len=wrong_input)

    def test_wrong_tmd_len(self, df_feat, df_cat, wrong_input):
        with pytest.raises(ValueError):
            _get_df_pos(df_feat=df_feat, df_cat=df_cat, tmd_len=wrong_input)

    def test_wrong_jmd_len(self, df_feat, df_cat, wrong_input):
        with pytest.raises(ValueError):
            _get_df_pos(df_feat=df_feat, df_cat=df_cat, jmd_n_len=wrong_input)
        with pytest.raises(ValueError):
            _get_df_pos(df_feat=df_feat, df_cat=df_cat, jmd_c_len=wrong_input)

    def test_wrong_start(self, df_feat, df_cat, wrong_input):
        with pytest.raises(ValueError):
            _get_df_pos(df_feat=df_feat, start=wrong_input)
        with pytest.raises(ValueError):
            _get_df_pos(df_feat=df_feat, df_cat=df_cat, start=-4)

    def test_wrong_normalize(self, df_feat, df_cat, wrong_input):
        with pytest.raises(ValueError):
            _get_df_pos(df_feat=df_feat, df_cat=df_cat, normalize=wrong_input)

    def test_wrong_value_type(self, df_feat, df_cat, wrong_input):
        with pytest.raises(ValueError):
            _get_df_pos(df_feat=df_feat, df_cat=df_cat, value_type=wrong_input)

    def test_wrong_value_col(self, df_feat, df_cat, wrong_input):
        with pytest.raises(ValueError):
            _get_df_pos(df_feat=df_feat, df_cat=df_cat, val_col=wrong_input)

    def test_wrong_y(self, df_feat, df_cat, wrong_input):
        with pytest.raises(ValueError):
            _get_df_pos(df_feat=df_feat, df_cat=df_cat, y=wrong_input)
"""


class TestPlotMethods:
    """General test for plotting methods (using heatmap)."""

    # Positive & Negative unit tests
    def test_df_feat(self, df_feat, cpp):
        """Positive unit test of main arguments: df_feat, y, val_col, value_type, normalize."""
        assert isinstance(cpp.plot_heatmap(df_feat=df_feat), mpl.axes.Axes)
        for y in ["category", "subcategory", "scale_name"]:
            assert isinstance(cpp.plot_heatmap(df_feat=df_feat, y=y), mpl.axes.Axes)
        for val_col in ["abs_auc", "abs_mean_dif", "mean_dif", "std_test", "p_val_fdr_bh"]:
            assert isinstance(cpp.plot_heatmap(df_feat=df_feat, val_col=val_col), mpl.axes.Axes)
        for val_type in ["sum", "mean", "std"]:
            for normalize in [True, False, "positions"]:
                assert isinstance(cpp.plot_heatmap(df_feat=df_feat, val_type=val_type, normalize=normalize),
                                  mpl.axes.Axes)

    def test_wrong_df_feat(self, df_feat, cpp):
        for y in ["categorY", "sub__category", "Scale", "feature", 1, list, "abs_mean_dif"]:
            with pytest.raises(ValueError):
                cpp.plot_heatmap(df_feat=df_feat, y=y)
        for val_col in ["subcategory", "Abs_mean_dif", "p_val", 1, list]:
            with pytest.raises(ValueError):
                cpp.plot_heatmap(df_feat=df_feat, val_col=val_col)
        for val_type in ["SUM", "man", 1, 2]:
            for normalize in ["positions", True]:
                with pytest.raises(ValueError):
                    cpp.plot_heatmap(df_feat=df_feat, val_type=val_type, normalize=normalize)

    def test_plotting(self, df_feat, df_parts):
        """Test main plotting arguments: figsize, title, title_kws."""
        cpp = aa.CPP(df_parts=df_parts)
        # Figsize and title checked by matplotlib
        title_kws = {'fontsize': 11,
                     'fontweight': "bold",
                     'verticalalignment': 'baseline',
                     'horizontalalignment': "center"}
        assert isinstance(cpp.plot_heatmap(df_feat=df_feat, title="Test", title_kws=title_kws),
                          mpl.axes.Axes)

    def test_figsize(self, df_feat, df_parts):
        """Test figsize."""
        cpp = aa.CPP(df_parts=df_parts)
        args = dict(df_feat=df_feat, figsize=(10, 5))
        assert isinstance(cpp.plot_heatmap(**args), mpl.axes.Axes)
        assert isinstance(cpp.plot_bargraph(**args), mpl.axes.Axes)
        assert isinstance(cpp.plot_profile(**args), mpl.axes.Axes)

    def test_wrong_figsize(self, df_feat, df_parts):
        """Test wrong figsize."""
        cpp = aa.CPP(df_parts=df_parts)
        for figsize in [(0, 10), "a", [1, 2], (10, "a")]:
            args = dict(df_feat=df_feat, figsize=figsize)
            with pytest.raises(ValueError):
                cpp.plot_heatmap(**args)
            with pytest.raises(ValueError):
                cpp.plot_bargraph(**args)
            with pytest.raises(ValueError):
                cpp.plot_profile(**args)

    def test_dict_color(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        sf = aa.SequenceFeature()
        dict_color = sf.load_colors()
        assert isinstance(cpp.plot_heatmap(df_feat=df_feat, dict_color=dict_color),
                          mpl.axes.Axes)

    def test_wrong_dict_color(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        sf = aa.SequenceFeature()
        dict_color = sf.load_colors()
        for i in [1, dict(), "asdf", 0.1]:
            with pytest.raises(ValueError):
                cpp.plot_heatmap(df_feat=df_feat, dict_color=i)
        # Non-color value for a category
        dict_color["Composition"] = 1
        with pytest.raises(ValueError):
            cpp.plot_heatmap(df_feat=df_feat, dict_color=dict_color)
        # Missing categories
        dict_color = {"Composition": "blue"}
        with pytest.raises(ValueError):
            cpp.plot_heatmap(df_feat=df_feat, dict_color=dict_color)

    def test_sequences(self, df_feat, df_parts):
        """Test sequence input: tmd_seq, jmd_n_seq, jmd_c_seq."""
        # Length input tested in TestGetDfPos
        jmd_c_seq = "AAAAAAAAAAa"
        jmd_n_seq = "aa" * 10
        cpp = aa.CPP(df_parts=df_parts, jmd_n_len=len(jmd_n_seq), jmd_c_len=len(jmd_c_seq))
        assert isinstance(cpp.plot_heatmap(df_feat=df_feat,
                                           tmd_seq="AA" * 16, jmd_c_seq=jmd_c_seq, jmd_n_seq=jmd_n_seq),
                          mpl.axes.Axes)

    def test_wrong_sequences(self, df_feat, df_parts):
        """Test wrong sequence input: tmd_seq, jmd_n_seq, jmd_c_seq."""
        # Length input tested in TestGetDfPos
        cpp = aa.CPP(df_parts=df_parts)
        wrong_seq = [1, None, list, dict]
        tmd_seq = "A" * 20
        jmd_c_seq = "B" * 10
        jmd_n_seq = "C" * 10
        for w in wrong_seq:
            with pytest.raises(ValueError):
                cpp.plot_heatmap(df_feat=df_feat, tmd_seq=w, jmd_c_seq=jmd_c_seq, jmd_n_seq=jmd_n_seq)
            with pytest.raises(ValueError):
                cpp.plot_heatmap(df_feat=df_feat, tmd_seq=tmd_seq, jmd_c_seq=w, jmd_n_seq=jmd_n_seq)
            with pytest.raises(ValueError):
                cpp.plot_heatmap(df_feat=df_feat, tmd_seq=tmd_seq, jmd_c_seq=jmd_c_seq, jmd_n_seq=w)
        # Incomplete sequence combinations
        with pytest.raises(ValueError):
            cpp.plot_heatmap(df_feat=df_feat, tmd_seq=tmd_seq, jmd_c_seq=jmd_c_seq)
        with pytest.raises(ValueError):
            cpp.plot_heatmap(df_feat=df_feat, tmd_seq=tmd_seq, jmd_n_seq=jmd_n_seq)
        with pytest.raises(ValueError):
            cpp.plot_heatmap(df_feat=df_feat, jmd_c_seq=jmd_c_seq, jmd_n_seq=jmd_n_seq)
        jmd_c_seq = "AAAAAAAAAAa"
        jmd_n_seq = "aa" * 10
        with pytest.raises(ValueError):
            cpp.plot_heatmap(df_feat=df_feat, jmd_c_seq=jmd_c_seq, jmd_n_seq=jmd_n_seq)

    def test_size(self, df_feat, df_parts):
        """Test size input: seq_size, tmd_fontsize, jmd_fontsize."""
        cpp = aa.CPP(df_parts=df_parts)
        # NOTE(review): 'tmd_seq=11' looks like it was meant to be a size argument
        # (docstring mentions seq_size) — confirm against plot_heatmap's signature.
        assert isinstance(cpp.plot_heatmap(df_feat=df_feat,
                                           tmd_seq=11, jmd_fontsize=12, tmd_fontsize=11),
                          mpl.axes.Axes)
        # Simple check function -> No negative test

    def test_color(self, df_feat, df_parts):
        """Test color input: tmd_color, jmd_color, tmd_seq_color, jmd_seq_color."""
        cpp = aa.CPP(df_parts=df_parts)
        args = dict(df_feat=df_feat, tmd_color="b")
        assert isinstance(cpp.plot_heatmap(**args), mpl.axes.Axes)
        args = dict(df_feat=df_feat, jmd_seq_color="b")
        assert isinstance(cpp.plot_heatmap(**args), mpl.axes.Axes)
        # Simple check function -> No negative test

    def test_ticks(self, df_feat, df_parts):
        """Test xtick input: xtick_size, xtick_width, xtick_length."""
        cpp = aa.CPP(df_parts=df_parts)
        assert isinstance(cpp.plot_heatmap(df_feat=df_feat, xtick_size=11, xtick_width=2,
                                           xtick_length=5, ytick_size=11),
                          mpl.axes.Axes)
        # Simple check function -> No negative test

    def test_legend(self, df_feat, df_parts):
        """Test legend args for heatmap and profile: add_legend_cat, legend_kws."""
        cpp = aa.CPP(df_parts=df_parts)
        assert isinstance(cpp.plot_heatmap(df_feat=df_feat, legend_kws=dict(fontsize=11)),
                          mpl.axes.Axes)
        # Simple check function -> No negative test


class TestPlotHeatmap:
    """Test additional interface of heatmap."""

    # Positive and negative unit tests
    def test_vmin_vmax(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        for vmin, vmax in zip([-10, -5, 1, 2, 0.1], [0, 10, 2, 3, 0.2]):
            args = dict(df_feat=df_feat, vmin=vmin, vmax=vmax)
            assert isinstance(cpp.plot_heatmap(**args), mpl.axes.Axes)

    def test_wrong_vmin_vmax(self, df_feat, cpp):
        for vmin, vmax in zip([1, "a", -10, 2], [0, 1, -100, "2"]):
            with pytest.raises(ValueError):
                cpp.plot_heatmap(df_feat=df_feat, vmin=vmin, vmax=vmax)


class TestPlotGraphProfile:
    """Test additional interface of bargraph and profile plots."""

    # Positive and negative unit tests
    def test_color(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        args = dict(df_feat=df_feat, bar_color="r", edge_color="b")
        assert isinstance(cpp.plot_bargraph(**args), mpl.axes.Axes)
        args = dict(df_feat=df_feat, edge_color="b")
        assert isinstance(cpp.plot_profile(**args), mpl.axes.Axes)

    def test_wrong_color(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        args = dict(df_feat=df_feat, bar_color="a", edge_color=1)
        with pytest.raises(ValueError):
            cpp.plot_bargraph(**args)
        args = dict(df_feat=df_feat, edge_color=1)
        with pytest.raises(ValueError):
            cpp.plot_profile(**args)

    def test_ylim(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        args = dict(df_feat=df_feat, ylim=(0, 100))
        assert isinstance(cpp.plot_bargraph(**args), mpl.axes.Axes)
        assert isinstance(cpp.plot_profile(**args), mpl.axes.Axes)

    def test_wrong_ylim(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        # NOTE(review): '(0)' is just the int 0 (not a tuple) — confirm '(0,)' was not intended.
        for ylim in [1, "a", [1, 40], (0), (0, 2), (-10, "a")]:
            args = dict(df_feat=df_feat, ylim=ylim)
            with pytest.raises(ValueError):
                cpp.plot_bargraph(**args)
            with pytest.raises(ValueError):
                cpp.plot_profile(**args)

    def test_highlight_alpha(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        args = dict(df_feat=df_feat, highlight_alpha=0.5)
        assert isinstance(cpp.plot_bargraph(**args), mpl.axes.Axes)
        assert isinstance(cpp.plot_profile(**args), mpl.axes.Axes)

    def test_wrong_highlight_alpha(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        for i in ["a", 10, list]:
            args = dict(df_feat=df_feat, highlight_alpha=i)
            with pytest.raises(ValueError):
                cpp.plot_bargraph(**args)
            with pytest.raises(ValueError):
                cpp.plot_profile(**args)

    def test_grid_axis(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        for grid_axis in ["x", "y", "both"]:
            args = dict(df_feat=df_feat, grid_axis=grid_axis)
            assert isinstance(cpp.plot_bargraph(**args), mpl.axes.Axes)
            assert isinstance(cpp.plot_profile(**args), mpl.axes.Axes)

    def test_wrong_grid_axis(self, df_feat, df_parts):
        cpp = aa.CPP(df_parts=df_parts)
        for grid_axis in ["X", 1, None, list, "XY"]:
            args = dict(df_feat=df_feat, grid_axis=grid_axis)
            with pytest.raises(ValueError):
                cpp.plot_bargraph(**args)
            with pytest.raises(ValueError):
                cpp.plot_profile(**args)


class TestPlotStat:
    """Test additional interface of stat plot."""

    # Positive unit tests

    # Negative unit tests


# II Regression/Functional test
def test_add_pipeline(df_feat):
    """Check that the add_* methods leave the non-p-value columns unchanged."""
    # TODO check
    sf = aa.SequenceFeature()
    df_seq = sf.load_sequences(n_in_class=50)
    labels = [1 if x == "SUBEXPERT" else 0 for x in df_seq["class"]]
    df_parts = sf.get_df_parts(df_seq=df_seq)
    cpp = aa.CPP(df_parts=df_parts)
    df = df_feat.copy()
    cols = [x for x in list(df) if "p_val" not in x]
    df = cpp.add_scale_info(df_feat=df)
    assert df_feat[cols].equals(df[cols])
    df = cpp.add_stat(df_feat=df, labels=labels)
    df = cpp._add_positions(df_feat=df)
    assert df_feat[cols].equals(df[cols])
    df = cpp.add_scale_info(df_feat=df)
    assert df_feat[cols].equals(df[cols])
    df = cpp._add_positions(df_feat=df)
    df = cpp.add_stat(df_feat=df, labels=labels)
    assert df_feat[cols].equals(df[cols])
    df = cpp._add_positions(df_feat=df)
    assert df_feat[cols].equals(df[cols])


def test_cpp_pipeline():
    pass


def test_cpp_with_shap():
    pass


# ===========================================================================
# File: tests/unit/test_cpp_feature.py (next file in the original patch)
# ===========================================================================
"""
This is a script testing methods of SequenceFeature object
"""
import pandas as pd
import numpy as np
import pytest
from hypothesis import given, settings
import hypothesis.strategies as some

import aaanalysis as aa


# I Unit Tests
class TestLoadScales:
    """Unit test for loading scales."""

    # Positive unit test
    def test_load_data(self):
        """Unit test for aa.SequenceFeature().load_scales() method."""
        sf = aa.SequenceFeature()
        assert isinstance(sf.load_scales(clust_th=0.5), pd.DataFrame)

    # Negative test
    def test_wrong_clustered_values(self):
        sf = aa.SequenceFeature()
        for i in [0.1, -0.2, "a", None]:
            with pytest.raises(ValueError):
                sf.load_scales(clust_th=i)

    # Property-based testing
    @given(clustered=some.floats(min_value=-10, max_value=10))
    def test_clustered_integer(self, clustered):
        sf = aa.SequenceFeature()
        if clustered not in [0.5, 0.7]:
            with pytest.raises(ValueError):
                sf.load_scales(clust_th=clustered)


class TestLoadCategories:
    """Unit test for loading DataFrame with sequence categories."""
    # NOTE(review): these tests call aa.load_scales rather than a category loader —
    # looks like a copy-paste from TestLoadScales; confirm intent.

    # Positive unit test
    def test_load_categories(self):
        assert isinstance(aa.load_scales(clust_th=0.5), pd.DataFrame)

    # Negative test
    def test_wrong_clustered_values(self):
        for i in [0.1, -0.2, "a", None]:
            with pytest.raises(ValueError):
                aa.load_scales(clust_th=i)

    # Property-based testing
    @given(clustered=some.floats(min_value=-10, max_value=10))
    def test_clustered_integer(self, clustered):
        if clustered not in [0.5, 0.7]:
            with pytest.raises(ValueError):
                aa.load_scales(clust_th=clustered)


class TestGetDfParts:
    """Unit test for loading DataFrame with sequence parts."""

    # Positive unit test
    def test_getting_df_parts_based_on_parts(self, df_seq):
        sf = aa.SequenceFeature()
        assert isinstance(sf.get_df_parts(df_seq=df_seq), pd.DataFrame)
        df = df_seq.drop(["sequence"], axis=1)
        assert isinstance(sf.get_df_parts(df_seq=df, list_parts=["tmd"]), pd.DataFrame)

    def test_getting_df_parts_based_on_seq_info(self, df_seq):
        sf = aa.SequenceFeature()
        assert isinstance(sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=10), pd.DataFrame)
        df = df_seq.drop(["tmd"], axis=1)
        assert isinstance(sf.get_df_parts(df_seq=df, jmd_n_len=10, jmd_c_len=10), pd.DataFrame)

    def test_getting_df_parts_based_on_sequence(self, df_seq):
        sf = aa.SequenceFeature()
        assert isinstance(sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=10), pd.DataFrame)
        df = df_seq.drop(["tmd", "tmd_start", "tmd_stop", "jmd_c"], axis=1)
        assert isinstance(sf.get_df_parts(df_seq=df, jmd_n_len=10, jmd_c_len=10), pd.DataFrame)
        assert isinstance(sf.get_df_parts(df_seq=df, jmd_n_len=0, jmd_c_len=0, ext_len=0), pd.DataFrame)

    # Negative unit tests
    def test_wrong_inputs(self, df_seq, df_cat, df_scales):
        sf = aa.SequenceFeature()
        for i in [None, "a", df_cat, df_scales, 1.1, -1]:
            with pytest.raises(ValueError):
                sf.get_df_parts(df_seq=i)
            with pytest.raises(ValueError):
                sf.get_df_parts(df_seq=df_seq, ext_len=i)
        for i in ["a", df_cat, df_scales, 1.1, -1]:
            with pytest.raises(ValueError):
                sf.get_df_parts(df_seq=df_seq, jmd_n_len=i, jmd_c_len=i)
            with pytest.raises(ValueError):
                sf.get_df_parts(df_seq=df_seq, jmd_n_len=i, jmd_c_len=10)
            with pytest.raises(ValueError):
                sf.get_df_parts(df_seq=df_seq, jmd_n_len=10, jmd_c_len=i)

    def test_corrupted_df_seq(self, corrupted_df_seq):
        sf = aa.SequenceFeature()
        with pytest.raises(ValueError):
            sf.get_df_parts(df_seq=corrupted_df_seq)  # Via parametrized fixtures

    def test_wrong_parameter_combinations(self, df_seq, df_scales):
        sf = aa.SequenceFeature()
        df = df_seq.drop(["sequence"], axis=1)
        with pytest.raises(ValueError):
            sf.get_df_parts(df_seq=df, jmd_n_len=10, jmd_c_len=10)
        df = df_seq.drop(["tmd"], axis=1)
        with pytest.raises(ValueError):
            sf.get_df_parts(df_seq=df)


class TestGetSplitKws:
    """Unit tests for getting split arguments."""

    # Positive unit test
    def test_get_split_kws(self, df_cat):
        sf = aa.SequenceFeature()
        for i in ["Segment", "Pattern", "PeriodicPattern"]:
            assert isinstance(sf.get_split_kws(n_split_min=2, steps_pattern=[1, 3, 4], split_types=i), dict)

    # Negative unit tests
    def test_wrong_integer_input(self, df_cat):
        sf = aa.SequenceFeature()
        list_int_args = ["n_split_min", "n_split_max", "n_min", "n_max", "len_max"]
        for i in ["a", 1.1, -1, df_cat, dict, None]:
            for arg_names in list_int_args:
                arg = {arg_names: i}
                with pytest.raises(ValueError):
                    sf.get_split_kws(**arg)

    def test_wrong_ordered_list_input(self, df_cat):
        sf = aa.SequenceFeature()
        list_args = [[1, None, df_cat], [2, 1], [-1, 9], [0.1, 0.2], ["a", 4]]
        for list_arg in list_args:
            with pytest.raises(ValueError):
                sf.get_split_kws(steps_pattern=list_arg)
            with pytest.raises(ValueError):
                sf.get_split_kws(steps_periodicpattern=list_arg)

    def test_wrong_combination_of_input(self, df_cat):
        sf = aa.SequenceFeature()
        with pytest.raises(ValueError):
            sf.get_split_kws(n_split_max=4, n_split_min=6)
        with pytest.raises(ValueError):
            sf.get_split_kws(n_max=4, n_min=6)
        with pytest.raises(ValueError):
            sf.get_split_kws(n_max=3, n_min=6, len_max=1)


class TestFeatures:
    """Unit test for creating feature ids."""

    # Positive unit test
    def test_features(self, df_scales, list_parts):
        sf = aa.SequenceFeature()
        split_kws = sf.get_split_kws()
        assert isinstance(sf.get_features(), list)
        for parts in list_parts:
            assert isinstance(sf.get_features(list_parts=parts), list)
            for split_type in split_kws:
                args = dict(list_parts=parts, df_scales=df_scales, split_kws={split_type: split_kws[split_type]})
                assert isinstance(sf.get_features(**args), list)

    # Negative unit tests
    def test_wrong_input(self, df_cat, df_seq):
        sf = aa.SequenceFeature()
        for wrong_input in [1, -1, "TMD", ["TMD"], [1, 2], ["aa", "a"], [["tmd", "tmd_e"]], df_cat, [df_cat, df_seq]]:
            with pytest.raises(ValueError):
                sf.get_features(list_parts=wrong_input)
            with pytest.raises(ValueError):
                sf.get_features(list_parts=["tmd"], df_scales=wrong_input)
            with pytest.raises(ValueError):
                sf.get_features(list_parts=["tmd"], split_kws=wrong_input)

    def test_corrupted_list_parts(self, corrupted_list_parts):
        sf = aa.SequenceFeature()
        with pytest.raises(ValueError):
            sf.get_features(list_parts=corrupted_list_parts)  # Via parametrized fixtures

    def test_corrupted_df_scales(self, corrupted_df_scales):
        sf = aa.SequenceFeature()
        with pytest.raises(ValueError):
            sf.get_features(list_parts=["tmd"], df_scales=corrupted_df_scales)  # Via parametrized fixtures

    def test_corrupted_split_kws(self, corrupted_split_kws):
        sf = aa.SequenceFeature()
        with pytest.raises(ValueError):
            sf.get_features(list_parts=["tmd"], split_kws=corrupted_split_kws)  # Via parametrized fixtures


class TestFeatureName:
    """Unit tests for getting feature names."""

    # Positive unit test
    def test_feat_name(self, df_feat, df_cat):
        sf = aa.SequenceFeature()
        assert isinstance(sf.feat_names(features=df_feat["feature"]), list)
        assert isinstance(sf.feat_names(features=list(df_feat["feature"])), list)
        assert isinstance(sf.feat_names(features=list(df_feat["feature"])[0]), list)
        assert isinstance(sf.feat_names(features=df_feat["feature"], df_cat=df_cat), list)

    # Property based testing
    @given(tmd_len=some.integers(min_value=15, max_value=100),
           jmd_n_len=some.integers(min_value=5, max_value=20),
           jmd_c_len=some.integers(min_value=5, max_value=20),
           ext_len=some.integers(min_value=1, max_value=4),
           start=some.integers(min_value=0, max_value=50))
    @settings(max_examples=10, deadline=None)
    def test_feat_name_tmd_len(self, df_feat_module_scope, tmd_len, jmd_n_len, jmd_c_len, ext_len, start):
        sf = aa.SequenceFeature()
        feat_names = sf.feat_names(features=df_feat_module_scope["feature"],
                                   tmd_len=tmd_len, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len,
                                   ext_len=ext_len, start=start)
        assert isinstance(feat_names, list)

    # Negative unit test
    def test_wrong_features(self, wrong_df):
        sf = aa.SequenceFeature()
        with pytest.raises(ValueError):
            sf.feat_names(features=wrong_df)

    def test_corrupted_feature(self, df_feat):
        sf = aa.SequenceFeature()
        for col in df_feat:
            if col != "feature":
                with pytest.raises(ValueError):
                    sf.feat_names(features=df_feat[col])
        # Feature id with an invalid part name
        wrong_feat = list(df_feat["feature"])[0]
        wrong_feat = "WRONG" + "-" + wrong_feat.split("-")[1] + "-" + wrong_feat.split("-")[2]
        with pytest.raises(ValueError):
            sf.feat_names(features=wrong_feat)

    def test_wrong_df_cat(self, df_feat, wrong_df):
        sf = aa.SequenceFeature()
        with pytest.raises(ValueError):
            sf.feat_names(features=df_feat["feature"], df_cat=wrong_df)

    def test_corrupted_df_cat(self, df_cat, df_feat):
        sf = aa.SequenceFeature()
        df_cat = df_cat[list(df_cat)[0:1]]
        with pytest.raises(ValueError):
            sf.feat_names(features=df_feat["feature"], df_cat=df_cat)


class TestFeatureValue:
    """Unit tests for getting feature values."""

    # Positive unit test
    def test_feature_value(self, df_seq, df_scales, list_parts, list_splits):
        sf = aa.SequenceFeature()
        df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True)
        for parts in list_parts:
            for split in list_splits:
                # NOTE(review): len(df_scales) counts rows but iloc[:, i] indexes columns —
                # confirm whether iterating over df_scales.shape[1] was intended.
                for i in range(0, len(df_scales)):
                    dict_scale = df_scales.iloc[:, i].to_dict()
                    x = sf.add_feat_value(split=split, dict_scale=dict_scale, df_parts=df_parts[parts])
                    assert isinstance(x, np.ndarray)

    def test_accept_gaps(self, df_seq, list_parts, list_splits, df_scales):
        sf = aa.SequenceFeature()
        df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True)
        parts, split, dict_scale = list_parts[0], list_splits[0], df_scales.iloc[:, 0].to_dict()
        df = df_parts.copy()
        args = dict(split=split, dict_scale=dict_scale)
        # Sequence with a gap character is only valid when gaps are accepted
        df[parts] = "AAA-CCC"
        assert isinstance(sf.add_feat_value(**args, df_parts=df[parts], accept_gaps=True), np.ndarray)
        with pytest.raises(ValueError):
            sf.add_feat_value(**args, df_parts=df[parts], accept_gaps=False)
        # All-gap sequence is never valid
        df[parts] = "------"
        with pytest.raises(ValueError):
            sf.add_feat_value(**args, df_parts=df[parts], accept_gaps=True)
        # Missing scale value behaves like a gap
        args = dict(split=split, df_parts=df_parts[parts])
        dict_scale_na = dict_scale.copy()
        dict_scale_na["A"] = np.nan
        assert isinstance(sf.add_feat_value(**args, dict_scale=dict_scale_na, accept_gaps=True), np.ndarray)
        with pytest.raises(ValueError):
            sf.add_feat_value(**args, dict_scale=dict_scale_na, accept_gaps=False)

    # Negative test
    def test_wrong_input(self, df_cat, df_seq, list_parts, list_splits, df_scales):
        sf = aa.SequenceFeature()
        df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True)
        parts, split, dict_scale = list_parts[0], list_splits[0], df_scales.iloc[:, 0].to_dict()
        list_wrong_input = [1, -1, "TMD", ["TMD"], None, [1, 2], ["aa", "a"], [["tmd", "tmd_e"]],
                            df_cat, [df_cat, df_seq], dict(a=1)]
        for wrong_input in list_wrong_input:
            with pytest.raises(ValueError):
                sf.add_feat_value(split=wrong_input, dict_scale=dict_scale, df_parts=df_parts[parts])
            with pytest.raises(ValueError):
                sf.add_feat_value(split=split, dict_scale=wrong_input, df_parts=df_parts[parts])
            with pytest.raises(ValueError):
                sf.add_feat_value(split=split, dict_scale=dict_scale, df_parts=wrong_input)

    def test_corrupted_split(self, df_seq, list_parts, df_scales, corrupted_list_splits):
        sf = aa.SequenceFeature()
        df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True)
        parts, dict_scale = list_parts[0], df_scales.iloc[:, 0].to_dict()
        with pytest.raises(ValueError):
            # Via parametrized fixtures
            sf.add_feat_value(split=corrupted_list_splits, dict_scale=dict_scale, df_parts=df_parts[parts])

    def test_corrupted_dict_scale(self, df_seq, list_parts, list_splits, df_scales):
        sf = aa.SequenceFeature()
        df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True)
        parts, split, dict_scale = list_parts[0], list_splits[0], df_scales.iloc[:, 0].to_dict()
        dict_scale1 = dict_scale.copy()
        dict_scale1["A"] = "A"          # non-numeric value
        dict_scale2 = dict_scale.copy()
        dict_scale2.pop("A")            # missing amino acid
        dict_scale3 = dict_scale.copy()
        dict_scale3["A"] = dict         # non-scalar value
        wrong_dict_scales = [dict(A=1, B=np.nan), dict(a=0), dict_scale1, dict_scale2, dict_scale3]
        for d in wrong_dict_scales:
            with pytest.raises(ValueError):
                sf.add_feat_value(split=split, dict_scale=d, df_parts=df_parts[parts])

    def test_corrupted_df_parts(self, list_splits, df_scales, corrupted_df_parts):
        sf = aa.SequenceFeature()
        split, dict_scale = list_splits[0], df_scales.iloc[:, 0].to_dict()
        with pytest.raises(ValueError):
            # Via parametrized fixtures
            sf.add_feat_value(split=split, dict_scale=dict_scale, df_parts=corrupted_df_parts)


class TestFeatureMatrix:
    """Unit tests for getting feature matrix."""

    # Positive unit test
    def test_feature_matrix(self, df_seq, df_scales):
        sf = aa.SequenceFeature()
        df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True)
        features = sf.get_features()[0:100]
        feat_matrix = sf.feat_matrix(df_parts=df_parts, df_scales=df_scales, features=features)
        assert isinstance(feat_matrix, np.ndarray)
        assert feat_matrix.shape == (len(df_seq), len(features))
        feat_matrix = sf.feat_matrix(df_parts=df_parts, features=features)
        assert isinstance(feat_matrix, np.ndarray)

    # Negative test
    def test_missing_parameters(self, df_scales, df_seq):
        sf = aa.SequenceFeature()
        df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True)
        features = sf.get_features()[0:100]
        with pytest.raises(ValueError):
            sf.feat_matrix(df_parts=df_parts)
        with pytest.raises(ValueError):
            sf.feat_matrix(features=features)
        with pytest.raises(ValueError):
            sf.feat_matrix(df_scales=df_scales)
        with pytest.raises(ValueError):
            sf.feat_matrix(df_parts=df_parts, df_scales=df_scales)
        with pytest.raises(ValueError):
            sf.feat_matrix(df_scales=df_scales, features=features)

    def test_wrong_input(self, df_cat, df_seq, df_scales):
        sf = aa.SequenceFeature()
        df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True)
        features = sf.get_features()[0:100]
        list_wrong_input = [1, -1, "TMD", ["TMD"], None, [1, 2], ["aa", "a"],
                            [["tmd", "tmd_e"]], df_cat, [df_cat, df_seq], dict(a=1)]
        for wrong_input in list_wrong_input:
            with pytest.raises(ValueError):
                sf.feat_matrix(df_parts=df_parts, df_scales=df_scales, features=wrong_input)
            if wrong_input is not None:  # None df_scales falls back to default scales
                with pytest.raises(ValueError):
                    sf.feat_matrix(df_parts=df_parts, df_scales=wrong_input, features=features)
                with pytest.raises(ValueError):
                    sf.feat_matrix(df_parts=wrong_input, df_scales=df_scales, features=features)

    def test_corrupted_df_parts(self, corrupted_df_parts, df_scales):
        sf = aa.SequenceFeature()
        features = sf.get_features()[0:100]
        with pytest.raises(ValueError):
            # Via parametrized fixtures
            sf.feat_matrix(df_parts=corrupted_df_parts, df_scales=df_scales, features=features)

    def test_corrupted_df_scales(self, corrupted_df_scales, df_seq):
        sf = aa.SequenceFeature()
        df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True)
        features = sf.get_features()[0:100]
        with pytest.raises(ValueError):
            # Via parametrized fixtures
            sf.feat_matrix(df_parts=df_parts, df_scales=corrupted_df_scales, features=features)

    def test_corrupted_features(self, df_scales, df_seq):
        sf = aa.SequenceFeature()
        df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=True)
        features = sf.get_features()[0:100]
        corrupted_features = [features[0:5] + [np.nan], features[0:3] + ["Test"],
                              "a",
                              [[features[0:4]]],
                              [x.upper() for x in features[0:5]],
                              [x[0:5] for x in features[0:5]],
                              ["a".join(x.split("-")) for x in features[0:6]]]
        for bad_features in corrupted_features:  # renamed to avoid shadowing 'features'
            with pytest.raises(ValueError):
                sf.feat_matrix(df_parts=df_parts, df_scales=df_scales, features=bad_features)


# II Regression test (Functional test)
def test_sequence_feature(list_splits):
    """Positive regression/functional test of all aa.SequenceFeature() methods."""
    sf = aa.SequenceFeature()
    # Get test set of sequences
    df_seq = sf.load_sequences()
    # Get feature components
    df_parts = sf.get_df_parts(df_seq=df_seq, all_parts=False)
    df_scales = sf.load_scales()
    split_kws = sf.get_split_kws()
    # Get features (names, values, matrix)
    features = sf.get_features()[0:100]
    feat_matrix = sf.feat_matrix(df_parts=df_parts, df_scales=df_scales, features=features)
    assert isinstance(feat_matrix, np.ndarray)
    assert feat_matrix.shape == (len(df_seq), len(features))


# ===========================================================================
# File: tests/unit/test_data_loader.py (next file in the original patch)
# ===========================================================================
"""
This is a script for ...
"""
from hypothesis import given, settings, example
import hypothesis.strategies as some
import aaanalysis.utils as ut
import aaanalysis as aa
from pandas import DataFrame
import pytest


class TestLoadDataset:
    """Test load_dataset function."""

    # Property-based testing for positive cases
    def test_df_seq_output_columns(self):
        """Every dataset should contain the key sequence columns."""
        all_data_set_names = aa.load_dataset()["Dataset"].to_list()
        for name in all_data_set_names:
            df = aa.load_dataset(name=name)
            assert set(ut.COLS_SEQ_KEY).issubset(set(df))

    @given(n=some.integers(min_value=1, max_value=100))
    def test_load_dataset_n_value(self, n):
        """Test the 'n' parameter for limiting rows."""
        max_n = aa.load_dataset(name="SEQ_LOCATION")["label"].value_counts().min()
        if max_n > n:
            df = aa.load_dataset(name="SEQ_LOCATION", n=n)
            assert len(df) == (n * 2)

    @given(min_len=some.integers(min_value=400, max_value=1000))
    def test_load_dataset_min_len(self, min_len):
        """Test the 'min_len' parameter for filtering sequences."""
        df = aa.load_dataset(name="SEQ_LOCATION", min_len=min_len)
        assert all(len(seq) >= min_len for seq in df[ut.COL_SEQ])

    @given(max_len=some.integers(min_value=50, max_value=100))
    def test_load_dataset_max_len(self, max_len):
        """Test the 'max_len' parameter for filtering sequences."""
        df = aa.load_dataset(name="SEQ_LOCATION", max_len=max_len)
        assert all(len(seq) <= max_len for seq in df[ut.COL_SEQ])

    # Property-based testing for negative cases
    @given(n=some.integers(max_value=0))
    def test_load_dataset_invalid_n(self, n):
        """Test with an invalid 'n' value."""
        with pytest.raises(ValueError):
            aa.load_dataset(name="SEQ_LOCATION", n=n)

    @given(min_len=some.integers(max_value=0))
    def test_load_dataset_invalid_min_len(self, min_len):
        """Test with an invalid 'min_len' value."""
        with pytest.raises(ValueError):
            aa.load_dataset(name="SEQ_LOCATION", min_len=min_len)
        with pytest.raises(ValueError):
            aa.load_dataset(name="SEQ_AMYLO", min_len=10)

    @given(max_len=some.integers(max_value=0))
    def test_load_dataset_invalid_max_len(self, max_len):
        """Test with an invalid 'max_len' value."""
        with pytest.raises(ValueError):
            aa.load_dataset(name="SEQ_LOCATION", max_len=max_len)

    # Additional Negative Tests
    @given(n=some.integers(min_value=1000, max_value=1050))
    def test_load_dataset_n_value_too_high(self, n):
        """Test 'n' values larger than the smallest class size."""
        max_n = aa.load_dataset(name="SEQ_LOCATION")["label"].value_counts().min()
        if max_n < n:
            with pytest.raises(ValueError):
                aa.load_dataset(name="SEQ_LOCATION", n=n)

    @given(negative_n=some.integers(min_value=-100, max_value=-1))
    def test_load_dataset_negative_n(self, negative_n):
        """Test with a negative 'n' value."""
        with pytest.raises(ValueError):
            aa.load_dataset(name="SEQ_LOCATION", n=negative_n)

    @given(non_canonical_aa=some.text())
    @example(non_canonical_aa="invalid_option")
    def test_load_dataset_invalid_non_canonical_aa(self, non_canonical_aa):
        """Test with an invalid 'non_canonical_aa' value."""
        if non_canonical_aa not in ["remove", "keep", "gap"]:
            with pytest.raises(ValueError):
                aa.load_dataset(name="SEQ_LOCATION", non_canonical_aa=non_canonical_aa)


class TestLoadDatasetComplex:
    """Test load_dataset function with complex scenarios."""
    # NOTE(review): a third method (test_load_dataset_min_max_len, combining
    # 'min_len' and 'max_len') was cut at the chunk boundary — restore from VCS.

    def test_load_dataset_n_and_min_len(self):
        """Test the 'n' and 'min_len' parameters together."""
        df = aa.load_dataset(name="SEQ_LOCATION", n=10, min_len=5)
        assert len(df) == 10 * 2
        assert all(len(seq) >= 5 for seq in df[ut.COL_SEQ])

    def test_load_dataset_n_and_max_len(self):
        """Test the 'n' and 'max_len' parameters together."""
        df = aa.load_dataset(name="SEQ_LOCATION", n=10, max_len=200)
        assert len(df) == 10 * 2
        assert all(len(seq) <= 200 for seq in df[ut.COL_SEQ])
assert all(5 <= len(seq) <= 200 for seq in df[ut.COL_SEQ]) + + def test_load_dataset_min_max_len_and_n(self): + """Test 'min_len', 'max_len', and 'n' together.""" + df = aa.load_dataset(name="SEQ_LOCATION", min_len=5, max_len=200, n=10) + assert len(df) == 10 * 2 + assert all(5 <= len(seq) <= 200 for seq in df[ut.COL_SEQ]) + + def test_load_dataset_all_filters(self): + """Test all filters together ('n', 'min_len', 'max_len', 'non_canonical_aa').""" + df = aa.load_dataset(name="SEQ_LOCATION", n=10, min_len=5, max_len=200, non_canonical_aa="remove") + assert len(df) == 10 * 2 + assert all(5 <= len(seq) <= 200 for seq in df[ut.COL_SEQ]) + # Add your assertion to check if non-canonical amino acids are removed + + def test_load_dataset_invalid_min_max_len(self): + """Test with 'min_len' greater than 'max_len'.""" + with pytest.raises(ValueError): + aa.load_dataset(name="SEQ_LOCATION", min_len=10, max_len=5) + + def test_load_dataset_invalid_min_max_len_and_n(self): + """Test with 'min_len' greater than 'max_len' and a valid 'n'.""" + with pytest.raises(ValueError): + aa.load_dataset(name="SEQ_LOCATION", min_len=10, max_len=5, n=10) + + def test_load_dataset_invalid_all_filters(self): + """Test with all invalid filters ('n', 'min_len', 'max_len', 'non_canonical_aa').""" + with pytest.raises(ValueError): + aa.load_dataset(name="SEQ_LOCATION", n=-1, min_len=10, max_len=5, non_canonical_aa="invalid_option") + diff --git a/tests/unit/test_dpulearn.py b/tests/unit/test_dpulearn.py new file mode 100644 index 00000000..2463d63c --- /dev/null +++ b/tests/unit/test_dpulearn.py @@ -0,0 +1,31 @@ +""" +This is a script for ... 
+""" +import time +import pandas as pd +import numpy as np + + +# Settings +pd.set_option('expand_frame_repr', False) # Single line print for pd.Dataframe + + +# I Helper Functions + + +# II Main Functions + + +# III Test/Caller Functions + + +# IV Main +def main(): + t0 = time.time() + + t1 = time.time() + print("Time:", t1 - t0) + + +if __name__ == "__main__": + main() diff --git a/tests/unit/test_plotting.py b/tests/unit/test_plotting.py new file mode 100644 index 00000000..2463d63c --- /dev/null +++ b/tests/unit/test_plotting.py @@ -0,0 +1,31 @@ +""" +This is a script for ... +""" +import time +import pandas as pd +import numpy as np + + +# Settings +pd.set_option('expand_frame_repr', False) # Single line print for pd.Dataframe + + +# I Helper Functions + + +# II Main Functions + + +# III Test/Caller Functions + + +# IV Main +def main(): + t0 = time.time() + + t1 = time.time() + print("Time:", t1 - t0) + + +if __name__ == "__main__": + main()