Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decouple dataset from plots #332

Merged
merged 15 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
299 changes: 279 additions & 20 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,21 @@
import logging
import warnings
import plotly
import scipy

from alphastats import BaseLoader


from alphastats.DataSet_Plot import Plot
from alphastats.DataSet_Preprocess import Preprocess, PreprocessingStateKeys
from alphastats.DataSet_Preprocess import Preprocess
from alphastats.DataSet_Statistics import Statistics
from alphastats.utils import LoaderError
from alphastats.utils import LoaderError, check_for_missing_values, ignore_warning
from alphastats.statistics.tukey_test import tukey_test
from alphastats.plots.ClusterMap import ClusterMap
from alphastats.plots.DimensionalityReduction import DimensionalityReduction
from alphastats.plots.IntensityPlot import IntensityPlot
from alphastats.plots.SampleHistogram import SampleHistogram
from alphastats.plots.VolcanoPlot import VolcanoPlot

plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template(
layout=plotly.graph_objects.Layout(
Expand All @@ -36,8 +42,8 @@
plotly.io.templates.default = "simple_white+alphastats_colors"


class DataSet(Plot):
"""Analysis Object"""
class DataSet:
"""The main object of AlphaPeptStats, providing an interface to all relevant functionality and data."""

def __init__(
self,
Expand Down Expand Up @@ -160,11 +166,11 @@ def batch_correction(self, batch: str) -> None:
def _get_statistics(self) -> Statistics:
"""Return instance of the Statistics object."""
return Statistics(
self.mat,
self.metadata,
self.index_column,
self.sample,
self.preprocessing_info,
mat=self.mat,
metadata=self.metadata,
index_column=self.index_column,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
)

def diff_expression_analysis(
Expand Down Expand Up @@ -208,6 +214,270 @@ def ancova(
"""A wrapper for Statistics.ancova(), see documentation there."""
return self._get_statistics().ancova(protein_id, covar, between)

@check_for_missing_values
def plot_pca(self, group: Optional[str] = None, circle: bool = False):
"""Plot Principal Component Analysis (PCA)

Args:
group (str, optional): column in metadata that should be used for coloring. Defaults to None.
circle (bool, optional): draw circle around each group. Defaults to False.

Returns:
plotly.graph_objects._figure.Figure: PCA plot
"""
dimensionality_reduction = DimensionalityReduction(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group=group,
circle=circle,
method="pca",
)
return dimensionality_reduction.plot

@check_for_missing_values
def plot_tsne(
self,
group: Optional[str] = None,
circle: bool = False,
perplexity: int = 5,
n_iter: int = 1000,
):
"""Plot t-distributed stochastic neighbor embedding (t-SNE)

Args:
group (str, optional): column in metadata that should be used for coloring. Defaults to None.
circle (bool, optional): draw circle around each group. Defaults to False.

Returns:
plotly.graph_objects._figure.Figure: t-SNE plot
"""
dimensionality_reduction = DimensionalityReduction(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group=group,
method="tsne",
circle=circle,
perplexity=perplexity,
n_iter=n_iter,
)
return dimensionality_reduction.plot

@check_for_missing_values
def plot_umap(self, group: Optional[str] = None, circle: bool = False):
"""Plot Uniform Manifold Approximation and Projection for Dimension Reduction

Args:
group (str, optional): column in metadata that should be used for coloring. Defaults to None.
circle (bool, optional): draw circle around each group. Defaults to False.

Returns:
plotly.graph_objects._figure.Figure: UMAP plot
"""
dimensionality_reduction = DimensionalityReduction(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
preprocessing_info=self.preprocessing_info,
group=group,
method="umap",
circle=circle,
)
return dimensionality_reduction.plot

@ignore_warning(RuntimeWarning)
def plot_volcano(
self,
group1: Union[str, list],
group2: Union[str, list],
column: str = None,
method: str = "ttest",
labels: bool = False,
min_fc: float = 1.0,
alpha: float = 0.05,
draw_line: bool = True,
perm: int = 100,
fdr: float = 0.05,
# compare_preprocessing_modes: bool = False, # TODO reimplement
color_list: list = [],
):
"""Plot Volcano Plot

Args:
column (str): column name in the metadata file with the two groups to compare
group1 (str/list): name of group to compare needs to be present in column or list of sample names to compare
group2 (str/list): name of group to compare needs to be present in column or list of sample names to compare
method (str): "anova", "wald", "ttest", "SAM" Defaul ttest.
labels (bool): Add text labels to significant Proteins, Default False.
alpha(float,optional): p-value cut off.
min_fc (float): Minimum fold change.
draw_line(boolean): whether to draw cut off lines.
perm(float,optional): number of permutations when using SAM as method. Defaults to 100.
fdr(float,optional): FDR cut off when using SAM as method. Defaults to 0.05.
color_list (list): list with ProteinIDs that should be highlighted.
compare_preprocessing_modes(bool): Will iterate through normalization and imputation modes and return a list of VolcanoPlots in different settings, Default False.


Returns:
plotly.graph_objects._figure.Figure: Volcano Plot
"""

# TODO this needs to orchestrated from outside this method
# if compare_preprocessing_modes:
# params_for_func = locals()
# results = self._compare_preprocessing_modes(
# func=VolcanoPlot, params_for_func=params_for_func
# )
# return results
#
# else:
volcano_plot = VolcanoPlot(
mat=self.mat,
rawinput=self.rawinput,
metadata=self.metadata,
sample=self.sample,
index_column=self.index_column,
gene_names=self.gene_names,
preprocessing_info=self.preprocessing_info,
group1=group1,
group2=group2,
column=column,
method=method,
labels=labels,
min_fc=min_fc,
alpha=alpha,
draw_line=draw_line,
perm=perm,
fdr=fdr,
color_list=color_list,
)

return volcano_plot.plot

def plot_intensity(
self,
protein_id: str,
group: str = None,
subgroups: list = None,
method: str = "box",
add_significance: bool = False,
log_scale: bool = False,
# compare_preprocessing_modes: bool = False, TODO reimplement
):
"""Plot Intensity of individual Protein/ProteinGroup

Args:
protein_id (str): ProteinGroup ID
group (str, optional): A metadata column used for grouping. Defaults to None.
subgroups (list, optional): Select variables from the group column. Defaults to None.
method (str, optional): Violinplot = "violin", Boxplot = "box", Scatterplot = "scatter" or "all". Defaults to "box".
add_significance (bool, optional): add p-value bar, only possible when two groups are compared. Defaults False.
log_scale (bool, optional): yaxis in logarithmic scale. Defaults to False.

Returns:
plotly.graph_objects._figure.Figure: Plotly Plot
"""
# TODO this needs to orchestrated from outside this method
# if compare_preprocessing_modes:
# params_for_func = locals()
# results = self._compare_preprocessing_modes(
# func=IntensityPlot, params_for_func=params_for_func
# )
# return results

intensity_plot = IntensityPlot(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
intensity_column=self.intensity_column,
preprocessing_info=self.preprocessing_info,
protein_id=protein_id,
group=group,
subgroups=subgroups,
method=method,
add_significance=add_significance,
log_scale=log_scale,
)

return intensity_plot.plot

@ignore_warning(UserWarning)
@check_for_missing_values
def plot_clustermap(
self,
label_bar: str = None,
only_significant: bool = False,
group: str = None,
subgroups: list = None,
):
"""Plot a matrix dataset as a hierarchically-clustered heatmap

Args:
label_bar (str, optional): column/variable name described in the metadata. Will be plotted as bar above the heatmap to see wheteher groups are clustering together. Defaults to None.. Defaults to None.
only_significant (bool, optional): performs ANOVA and only signficantly different proteins will be clustered (p<0.05). Defaults to False.
group (str, optional): group containing subgroups that should be clustered. Defaults to None.
subgroups (list, optional): variables in group that should be plotted. Defaults to None.

Returns:
ClusterGrid: Clustermap
"""

clustermap = ClusterMap(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
index_column=self.index_column,
preprocessing_info=self.preprocessing_info,
label_bar=label_bar,
only_significant=only_significant,
group=group,
subgroups=subgroups,
)
return clustermap.plot

def plot_samplehistograms(self):
"""Plots the density distribution of each sample

Returns:
plotly: Plotly Graph Object
"""
return SampleHistogram(mat=self.mat).plot()

def _get_plot(self) -> Plot:
"""Get instance of the Plot object."""
return Plot(
self.mat,
self.rawmat,
self.metadata,
self.sample,
self.preprocessing_info,
)

def plot_correlation_matrix(self, method: str = "pearson"): # TODO unused
"""A wrapper for Plot.plot_correlation_matrix(), see documentation there."""
return self._get_plot().plot_correlation_matrix(method)

def plot_sampledistribution(
self,
method: str = "violin",
color: str = None,
log_scale: bool = False,
use_raw: bool = False,
):
"""A wrapper for Plot.plot_sampledistribution(), see documentation there."""
return self._get_plot().plot_sampledistribution(
method, color, log_scale, use_raw
)

def plot_dendrogram(
self, linkagefun=lambda x: scipy.cluster.hierarchy.linkage(x, "complete")
):
"""A wrapper for Plot.plot_dendrogram(), see documentation there."""
return self._get_plot().plot_dendrogram(linkagefun)

def _check_loader(self, loader):
"""Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader

Expand Down Expand Up @@ -247,17 +517,6 @@ def _remove_misc_samples_in_metadata(self):
"are removed from the metadata."
)

# TODO this is implemented in both preprocessing and here
# This is only needed in the DimensionalityReduction class and only if the step was not run during preprocessing.
# idea: replace the step in DimensionalityReduction with something like:
# mat = self.data.mat.loc[sample_names,:] after creating sample_names.
def _subset(self):
# filter matrix so only samples that are described in metadata are also found in matrix
self.preprocessing_info.update(
{PreprocessingStateKeys.NUM_SAMPLES: self.metadata.shape[0]}
)
return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())]

def _create_matrix(self):
"""
Creates a matrix of the Outputfile, with columns displaying features (Proteins) and
Expand Down
Loading
Loading