Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

simplify-dataset-init #333

Merged
merged 7 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
235 changes: 80 additions & 155 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Union, Dict, Optional
from typing import List, Union, Dict, Optional, Tuple

import pandas as pd
import numpy as np
Expand All @@ -20,6 +20,7 @@
from alphastats.plots.IntensityPlot import IntensityPlot
from alphastats.plots.SampleHistogram import SampleHistogram
from alphastats.plots.VolcanoPlot import VolcanoPlot
from alphastats.dataset_factory import DataSetFactory

plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template(
layout=plotly.graph_objects.Layout(
Expand Down Expand Up @@ -48,14 +49,14 @@ class DataSet:
def __init__(
self,
loader: BaseLoader,
metadata_path: Optional[str] = None,
metadata_path: Optional[Union[str, pd.DataFrame]] = None,
sample_column: Optional[str] = None,
):
"""Create DataSet

Args:
loader (_type_): loader of class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader
metadata_path (str, optional): path to metadata file. Defaults to None.
metadata_path (str or pd.DataFrame, optional): path to metadata file or an actual df. Defaults to None.
sample_column (str, optional): column in metadata file indicating the sample IDs. Defaults to None.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
metadata_path (str or pd.DataFrame, optional): path to metadata file or an actual df. Defaults to None.
metadata_path_or_df (str or pd.DataFrame, optional): path to metadata file or an actual df. Defaults to None.

sorry to be this person hahaha

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is indeed a good suggestion to make this api better .. in the long run, I'd implement two distinct parameters (path and df) .. did the whole change here: #336


Attributes of a DataSet instance:
Expand All @@ -66,47 +67,81 @@ def __init__(
"""
self._check_loader(loader=loader)

# fill data from loader
self.rawinput: pd.DataFrame = loader.rawinput
self.software: str = loader.software
self.index_column: str = loader.index_column
self.intensity_column: Union[str, list] = loader.intensity_column
self.filter_columns: List[str] = loader.filter_columns
self.evidence_df: pd.DataFrame = loader.evidence_df
self.gene_names: str = loader.gene_names

# include filtering before
self._create_matrix()
self._check_matrix_values()

self.metadata: pd.DataFrame
self.sample: str
if metadata_path is not None:
self.sample = sample_column
self.metadata = self._load_metadata(file_path=metadata_path)
self._remove_misc_samples_in_metadata()
else:
self.sample = "sample"
self.metadata = pd.DataFrame({"sample": list(self.mat.index)})

if loader == "Generic":
intensity_column = loader._extract_sample_names(
self.index_column: str = loader.index_column
self.software: str = loader.software
self._gene_names: str = loader.gene_names

self._intensity_column: Union[str, list] = (
loader._extract_sample_names(
Copy link
Contributor Author

@mschwoer mschwoer Sep 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note that this was different before:
loader.intensity_column was used to create the mat object, and only afterwards overwritten in the case of a generic loader .. I suppose this was not intended

metadata=self.metadata, sample_column=self.sample
)
self.intensity_column = intensity_column
if loader == "Generic"
else loader.intensity_column
)

# init preprocessing settings
self.preprocessing_info: Dict = Preprocess.init_preprocessing_info(
num_samples=self.mat.shape[0],
num_protein_groups=self.mat.shape[1],
intensity_column=self.intensity_column,
filter_columns=self.filter_columns,
# self.evidence_df: pd.DataFrame = loader.evidence_df # TODO unused

self.dataset_factory = DataSetFactory(
rawinput=self.rawinput,
index_column=self.index_column,
intensity_column=self._intensity_column,
metadata_path=metadata_path,
sample_column=sample_column,
)

self.preprocessed = False
self.preprocessed: bool = False
rawmat, mat, metadata, sample, preprocessing_info, preprocessed = (
self._init_dataset()
)
self.rawmat: pd.DataFrame = rawmat
self.mat: pd.DataFrame = mat
self.metadata: pd.DataFrame = metadata
self.sample: str = sample
self.preprocessing_info: Dict = preprocessing_info
self._preprocessed: bool = preprocessed

print("DataSet has been created.")

def _init_dataset(self):
    """Build the initial dataset state via the dataset factory.

    Returns a 6-tuple ``(rawmat, mat, metadata, sample, preprocessing_info,
    preprocessed)``; the final flag starts as False because no preprocessing
    has run yet.
    """
    raw_matrix, matrix = self.dataset_factory.create_matrix_from_rawinput()

    meta_df, sample_col = self.dataset_factory.create_metadata(matrix)

    n_samples, n_protein_groups = matrix.shape
    prep_info = Preprocess.init_preprocessing_info(
        num_samples=n_samples,
        num_protein_groups=n_protein_groups,
        intensity_column=self._intensity_column,
        filter_columns=self.filter_columns,
    )

    # TODO could be moved to preprocessing_info dict
    is_preprocessed = False

    return raw_matrix, matrix, meta_df, sample_col, prep_info, is_preprocessed

def _check_loader(self, loader):
    """Validate that *loader* is a usable BaseLoader instance.

    Args:
        loader : loader

    Raises:
        LoaderError: if *loader* is not an instance of a BaseLoader subclass.
        ValueError: if the loader's rawinput is not a non-empty DataFrame,
            or its index_column is not a string.
    """
    if not isinstance(loader, BaseLoader):
        raise LoaderError(
            "loader must be a subclass of BaseLoader, "
            f"got {loader.__class__.__name__}"
        )

    rawinput_is_valid = (
        isinstance(loader.rawinput, pd.DataFrame) and not loader.rawinput.empty
    )
    if not rawinput_is_valid:
        raise ValueError(
            "Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
        )

    if not isinstance(loader.index_column, str):
        raise ValueError(
            "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
        )

def _get_preprocess(self) -> Preprocess:
"""Return instance of the Preprocess object."""
return Preprocess(
Expand Down Expand Up @@ -143,21 +178,18 @@ def preprocess(
**kwargs,
)
)
self.preprocessed = True
self._preprocessed = True

def reset_preprocessing(self):
    """Reset all preprocessing steps.

    Rebuilds matrix, metadata, sample column and preprocessing info from
    scratch via ``_init_dataset()``, discarding everything ``preprocess()``
    has done so far. (The scraped diff interleaved the old and new bodies;
    this is the consolidated post-refactor implementation, which also fixes
    the old bug where metadata was not reset.)
    """
    (
        self.rawmat,
        self.mat,
        self.metadata,
        self.sample,
        self.preprocessing_info,
        self._preprocessed,
    ) = self._init_dataset()

    # user feedback, consistent with the "DataSet has been created." print
    print("All preprocessing steps are reset.")

def batch_correction(self, batch: str) -> None:
"""A wrapper for Preprocess.batch_correction(), see documentation there."""
Expand Down Expand Up @@ -340,7 +372,7 @@ def plot_volcano(
metadata=self.metadata,
sample=self.sample,
index_column=self.index_column,
gene_names=self.gene_names,
gene_names=self._gene_names,
preprocessing_info=self.preprocessing_info,
group1=group1,
group2=group2,
Expand Down Expand Up @@ -392,7 +424,7 @@ def plot_intensity(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
intensity_column=self.intensity_column,
intensity_column=self._intensity_column,
preprocessing_info=self.preprocessing_info,
protein_id=protein_id,
group=group,
Expand Down Expand Up @@ -477,110 +509,3 @@ def plot_dendrogram(
):
"""A wrapper for Plot.plot_dendrogram(), see documentation there."""
return self._get_plot().plot_dendrogram(linkagefun)

def _check_loader(self, loader):
    """Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader

    Args:
        loader : loader

    Raises:
        LoaderError: if *loader* is not an instance of a BaseLoader subclass.
        ValueError: if the loader's rawinput is not a non-empty DataFrame,
            or its index_column is not a string.
    """
    # reject anything that is not a project loader object
    if not isinstance(loader, BaseLoader):
        raise LoaderError(
            "loader must be a subclass of BaseLoader, "
            f"got {loader.__class__.__name__}"
        )

    # rawinput must be a non-empty DataFrame for matrix creation to work
    if not isinstance(loader.rawinput, pd.DataFrame) or loader.rawinput.empty:
        raise ValueError(
            "Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
        )

    # index_column names the feature (protein) ID column and must be a single string
    if not isinstance(loader.index_column, str):
        raise ValueError(
            "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
        )

def _check_matrix_values(self):
if np.isinf(self.mat).values.sum() > 0:
logging.warning("Data contains infinite values.")

def _remove_misc_samples_in_metadata(self):
samples_matrix = self.mat.index.to_list()
samples_metadata = self.metadata[self.sample].to_list()
misc_samples = list(set(samples_metadata) - set(samples_matrix))
if len(misc_samples) > 0:
self.metadata = self.metadata[
~self.metadata[self.sample].isin(misc_samples)
]
logging.warning(
f"{misc_samples} are not described in the protein data and"
"are removed from the metadata."
)

def _create_matrix(self):
"""
Creates a matrix of the Outputfile, with columns displaying features (Proteins) and
rows the samples.
"""

df = self.rawinput
df = df.set_index(self.index_column)

if isinstance(self.intensity_column, str):
regex_find_intensity_columns = self.intensity_column.replace(
"[sample]", ".*"
)
df = df.filter(regex=(regex_find_intensity_columns), axis=1)
# remove Intensity so only sample names remain
substring_to_remove = regex_find_intensity_columns.replace(".*", "")
df.columns = df.columns.str.replace(substring_to_remove, "")

else:
df = df[self.intensity_column]

# transpose dataframe
mat = df.transpose()
mat.replace([np.inf, -np.inf], np.nan, inplace=True)
self.rawmat = mat

# remove proteins with only zero # TODO this is re-done in preprocessing
mat_no_zeros = mat.loc[:, (mat != 0).any(axis=0)]
self.mat = mat_no_zeros.astype(float)

def _load_metadata(
self, file_path: Union[pd.DataFrame, str]
) -> Optional[pd.DataFrame]:
"""Load metadata either xlsx, txt, csv or txt file

Args:
file_path: path to metadata file or metadata DataFrame # TODO disentangle this
"""
if isinstance(file_path, pd.DataFrame):
df = file_path
elif file_path.endswith(".xlsx"):
warnings.filterwarnings(
"ignore",
category=UserWarning,
module="openpyxl",
# message=r"/extension is not supported and will be removed/", # this somehow does not work here?
)
df = pd.read_excel(file_path)
# find robust way to detect file format
# else give file separation as variable
elif file_path.endswith(".txt") or file_path.endswith(".tsv"):
df = pd.read_csv(file_path, delimiter="\t")
elif file_path.endswith(".csv"):
df = pd.read_csv(file_path)
else:
logging.warning(
"WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file"
)
return None

if df is not None and self.sample not in df.columns:
logging.error(f"sample_column: {self.sample} not found in {file_path}")

# check whether sample labeling matches protein data
# warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
df.columns = df.columns.astype(str)
return df
Loading
Loading