
Draft refactor dataset #322

Merged: 9 commits, Sep 20, 2024
Changes from all commits
121 changes: 94 additions & 27 deletions alphastats/DataSet.py
@@ -1,3 +1,5 @@
from typing import List, Union, Dict, Optional

import pandas as pd
import numpy as np
import logging
@@ -34,10 +36,15 @@
plotly.io.templates.default = "simple_white+alphastats_colors"


class DataSet(Preprocess, Statistics, Plot, Enrichment):
class DataSet(Statistics, Plot, Enrichment):
"""Analysis Object"""

def __init__(self, loader, metadata_path=None, sample_column=None):
def __init__(
self,
loader: BaseLoader,
metadata_path: Optional[str] = None,
sample_column: Optional[str] = None,
):
"""Create DataSet

Args:
@@ -47,46 +54,93 @@ def __init__(self, loader, metadata_path=None, sample_column=None):

"""
self._check_loader(loader=loader)
#  load data from loader object
self.loader = loader
self.rawinput = loader.rawinput
self.software = loader.software
self.index_column = loader.index_column
self.intensity_column = loader.intensity_column
self.filter_columns = loader.filter_columns
self.evidence_df = loader.evidence_df
self.gene_names = loader.gene_names

self.rawinput: pd.DataFrame = loader.rawinput
self.software: str = loader.software
self.index_column: str = loader.index_column
self.intensity_column: Union[str, list] = loader.intensity_column
self.filter_columns: List[str] = loader.filter_columns
self.evidence_df: pd.DataFrame = loader.evidence_df
self.gene_names: str = loader.gene_names

# include filtering before
self.create_matrix()
self._check_matrix_values()
self.metadata = None

self.metadata: pd.DataFrame
self.sample: str
if metadata_path is not None:
self.sample = sample_column
self.load_metadata(file_path=metadata_path)
self.metadata = self.load_metadata(file_path=metadata_path)
self._remove_misc_samples_in_metadata()

else:
self._create_metadata()
self.sample = "sample"
self.metadata = pd.DataFrame({"sample": list(self.mat.index)})

if self.loader == "Generic":
if loader == "Generic":
intensity_column = loader._extract_sample_names(
metadata=self.metadata, sample_column=self.sample
)
self.intensity_column = intensity_column

# save preprocessing settings
self.preprocessing_info = self._save_dataset_info()
self.preprocessed = False
self.preprocessing_info: Dict = self._save_dataset_info()
self.preprocessed: bool = False

print("DataSet has been created.")
self.overview()

def _create_metadata(self):
samples = list(self.mat.index)
self.metadata = pd.DataFrame({"sample": samples})
self.sample = "sample"
def preprocess(
self,
log2_transform: bool = True,
remove_contaminations: bool = False,
subset: bool = False,
data_completeness: float = 0,
normalization: str = None,
imputation: str = None,
remove_samples: list = None,
**kwargs,
) -> None:
"""A wrapper for the preprocess() method, see documentation in Preprocess.preprocess()."""
pp = Preprocess(
self.filter_columns,
self.rawinput,
self.index_column,
self.sample,
self.metadata,
self.preprocessing_info,
self.mat,
)

self.mat, self.metadata, self.preprocessing_info = pp.preprocess(
log2_transform,
remove_contaminations,
subset,
data_completeness,
normalization,
imputation,
remove_samples,
**kwargs,
)
self.preprocessed = True

def reset_preprocessing(self):
"""Reset all preprocessing steps"""
self.create_matrix()
# TODO fix bug: metadata is not reset here
print("All preprocessing steps are reset.")

def batch_correction(self, batch: str) -> None:
pp = Preprocess(
self.filter_columns,
self.rawinput,
self.index_column,
self.sample,
self.metadata,
self.preprocessing_info,
self.mat,
)
self.mat = pp.batch_correction(batch)

def _check_loader(self, loader):
"""Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader
@@ -127,6 +181,17 @@ def _remove_misc_samples_in_metadata(self):
"are removed from the metadata."
)

# TODO this is implemented in both preprocessing and here
# This is only needed in the DimensionalityReduction class and only if the step was not run during preprocessing.
# idea: replace the step in DimensionalityReduction with something like:
# mat = self.data.mat.loc[sample_names,:] after creating sample_names.
def _subset(self):
# filter matrix so only samples that are described in metadata are also found in matrix
self.preprocessing_info.update(
{"Matrix: Number of samples": self.metadata.shape[0]}
)
return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())]
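# Reviewer sketch (hypothetical, not part of this diff): the replacement hinted at in the
# TODO above could live in DimensionalityReduction, assuming it keeps a reference to the
# dataset as `self.dataset`; it mirrors the _subset() logic directly:
#     sample_names = self.dataset.metadata[self.dataset.sample].tolist()
#     mat = self.dataset.mat[self.dataset.mat.index.isin(sample_names)]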

def create_matrix(self):
"""
Creates a matrix of the Outputfile, with columns displaying features (Proteins) and
@@ -147,26 +212,28 @@ def create_matrix(self):

else:
df = df[self.intensity_column]

# transpose dataframe
mat = df.transpose()
mat.replace([np.inf, -np.inf], np.nan, inplace=True)
# remove proteins with only zero

# remove proteins with only zero # TODO this is re-done in preprocessing
self.mat = mat.loc[:, (mat != 0).any(axis=0)]
self.mat = self.mat.astype(float)

# reset preprocessing info
self.preprocessing_info = self._save_dataset_info()
self.preprocessed = False
self.rawmat = mat

def load_metadata(self, file_path):
def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame:
"""Load metadata either xlsx, txt, csv or txt file

Args:
file_path (str): path to metadata file
file_path: path to metadata file or metadata DataFrame # TODO disentangle this
"""
if isinstance(file_path, pd.DataFrame):
df = file_path
#  loading file needs to be more beautiful
elif file_path.endswith(".xlsx"):
warnings.filterwarnings(
"ignore",
Expand All @@ -193,7 +260,7 @@ def load_metadata(self, file_path):
# check whether sample labeling matches protein data
# warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
df.columns = df.columns.astype(str)
self.metadata = df
return df

def _save_dataset_info(self):
n_proteingroups = self.mat.shape[1]
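Reviewer note on the refactor above: since DataSet no longer inherits from Preprocess, callers go through the new wrapper methods. Below is a minimal usage sketch, not part of the diff; the loader class, import path, file names, and parameter values are assumptions for illustration, while the DataSet methods and signatures are taken from this file.

from alphastats import DataSet, MaxQuantLoader  # assumed top-level exports

loader = MaxQuantLoader(file="proteinGroups.txt")  # hypothetical input file
dataset = DataSet(
    loader=loader,
    metadata_path="metadata.xlsx",  # load_metadata also accepts a pd.DataFrame
    sample_column="sample",
)

# Preprocessing is composed, not inherited: the wrapper builds a Preprocess object internally.
dataset.preprocess(remove_contaminations=True, data_completeness=0.7)
dataset.batch_correction(batch="batch")  # assumes a "batch" column in the metadata
dataset.reset_preprocessing()  # re-creates the matrix; metadata is not reset (see TODO)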
64 changes: 33 additions & 31 deletions alphastats/DataSet_Plot.py
@@ -135,7 +135,7 @@ def plot_volcano(
draw_line: bool = True,
perm: int = 100,
fdr: float = 0.05,
compare_preprocessing_modes: bool = False,
# compare_preprocessing_modes: bool = False, # TODO reimplement
color_list: list = [],
):
"""Plot Volcano Plot
@@ -159,30 +159,31 @@
plotly.graph_objects._figure.Figure: Volcano Plot
"""

if compare_preprocessing_modes:
params_for_func = locals()
results = self._compare_preprocessing_modes(
func=VolcanoPlot, params_for_func=params_for_func
)
return results

else:
volcano_plot = VolcanoPlot(
dataset=self,
group1=group1,
group2=group2,
column=column,
method=method,
labels=labels,
min_fc=min_fc,
alpha=alpha,
draw_line=draw_line,
perm=perm,
fdr=fdr,
color_list=color_list,
)
# TODO this needs to be orchestrated from outside this method
# if compare_preprocessing_modes:
# params_for_func = locals()
# results = self._compare_preprocessing_modes(
# func=VolcanoPlot, params_for_func=params_for_func
# )
# return results
#
# else:
volcano_plot = VolcanoPlot(
dataset=self,
group1=group1,
group2=group2,
column=column,
method=method,
labels=labels,
min_fc=min_fc,
alpha=alpha,
draw_line=draw_line,
perm=perm,
fdr=fdr,
color_list=color_list,
)

return volcano_plot.plot
return volcano_plot.plot

def plot_correlation_matrix(self, method: str = "pearson"):
"""Plot Correlation Matrix
@@ -266,7 +267,7 @@ def plot_intensity(
method: str = "box",
add_significance: bool = False,
log_scale: bool = False,
compare_preprocessing_modes: bool = False,
# compare_preprocessing_modes: bool = False, TODO reimplement
):
"""Plot Intensity of individual Protein/ProteinGroup

@@ -281,12 +282,13 @@
Returns:
plotly.graph_objects._figure.Figure: Plotly Plot
"""
if compare_preprocessing_modes:
params_for_func = locals()
results = self._compare_preprocessing_modes(
func=IntensityPlot, params_for_func=params_for_func
)
return results
# TODO this needs to be orchestrated from outside this method
# if compare_preprocessing_modes:
# params_for_func = locals()
# results = self._compare_preprocessing_modes(
# func=IntensityPlot, params_for_func=params_for_func
# )
# return results

intensity_plot = IntensityPlot(
dataset=self,
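Reviewer note on the plotting changes: with compare_preprocessing_modes commented out until it is orchestrated from outside these methods, the plot entry points are called directly on the dataset. A minimal sketch against the plot_volcano signature shown above, reusing the dataset object from the earlier sketch; the group values, column name, and statistical method are assumptions about the metadata and the supported options.

fig = dataset.plot_volcano(
    group1="control",      # assumed values in the grouping column
    group2="treated",
    column="condition",    # assumed metadata column
    method="ttest",        # assumed to be among the supported methods
    min_fc=1.0,
    alpha=0.05,
)
fig.show()  # the method returns a plotly figure per the docstring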