introduce datasetfactory
mschwoer committed Sep 18, 2024
1 parent 842f418 commit 3abea3e
Showing 2 changed files with 127 additions and 106 deletions.
106 changes: 0 additions & 106 deletions alphastats/DataSet.py
@@ -482,109 +482,3 @@ def plot_dendrogram(
):
"""A wrapper for Plot.plot_dendrogram(), see documentation there."""
return self._get_plot().plot_dendrogram(linkagefun)

def _check_loader(self, loader):
"""Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader
Args:
loader : loader
"""
if not isinstance(loader, BaseLoader):
raise LoaderError(
"loader must be a subclass of BaseLoader, "
f"got {loader.__class__.__name__}"
)

if not isinstance(loader.rawinput, pd.DataFrame) or loader.rawinput.empty:
raise ValueError(
"Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
)

if not isinstance(loader.index_column, str):
raise ValueError(
"Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
)

@staticmethod
def _check_matrix_values(mat: pd.DataFrame) -> None:
if np.isinf(mat).values.sum() > 0:
logging.warning("Data contains infinite values.")

def _remove_misc_samples_in_metadata(self, metadata: pd.DataFrame) -> pd.DataFrame:
samples_matrix = self.mat.index.to_list()
samples_metadata = metadata[self.sample].to_list()
misc_samples = list(set(samples_metadata) - set(samples_matrix))
if len(misc_samples) > 0:
metadata = metadata[~metadata[self.sample].isin(misc_samples)]
logging.warning(
f"{misc_samples} are not described in the protein data and"
"are removed from the metadata."
)
return metadata

def _create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Creates a matrix of the Outputfile, with columns displaying features (Proteins) and
rows the samples.
"""

df = self.rawinput
df = df.set_index(self.index_column)

if isinstance(self._intensity_column, str):
regex_find_intensity_columns = self._intensity_column.replace(
"[sample]", ".*"
)
df = df.filter(regex=regex_find_intensity_columns, axis=1)
# remove Intensity so only sample names remain
substring_to_remove = regex_find_intensity_columns.replace(".*", "")
df.columns = df.columns.str.replace(substring_to_remove, "")

else:
df = df[self._intensity_column]

rawmat = df.transpose()
rawmat.replace([np.inf, -np.inf], np.nan, inplace=True)

# remove proteins with only zero # TODO this is re-done in preprocessing
mat_no_zeros = rawmat.loc[:, (rawmat != 0).any(axis=0)]

return rawmat, mat_no_zeros.astype(float)

def _load_metadata(
self, file_path: Union[pd.DataFrame, str]
) -> Optional[pd.DataFrame]:
"""Load metadata either xlsx, txt, csv or txt file
Args:
file_path: path to metadata file or metadata DataFrame # TODO disentangle this
"""
if isinstance(file_path, pd.DataFrame):
df = file_path
elif file_path.endswith(".xlsx"):
warnings.filterwarnings(
"ignore",
category=UserWarning,
module="openpyxl",
# message=r"/extension is not supported and will be removed/", # this somehow does not work here?
)
df = pd.read_excel(file_path)
# find robust way to detect file format
# else give file separation as variable
elif file_path.endswith(".txt") or file_path.endswith(".tsv"):
df = pd.read_csv(file_path, delimiter="\t")
elif file_path.endswith(".csv"):
df = pd.read_csv(file_path)
else:
logging.warning(
"WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file"
)
return None

if df is not None and self.sample not in df.columns:
logging.error(f"sample_column: {self.sample} not found in {file_path}")

# check whether sample labeling matches protein data
# warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
df.columns = df.columns.astype(str)
return df
127 changes: 127 additions & 0 deletions alphastats/dataset_factory.py
@@ -0,0 +1,127 @@
import logging
import warnings
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd


class DataSetFactory:
"""Create all 'heavy' data structures of a DataSet."""

def __init__(
self,
*,
rawinput: pd.DataFrame,
index_column: str,
intensity_column: Union[List[str], str],
metadata_path: Union[str, pd.DataFrame],
sample_column: str,
):
self.rawinput: pd.DataFrame = rawinput
self.sample_column: str = sample_column
self.index_column: str = index_column
self.intensity_column: Union[List[str], str] = intensity_column
self.metadata_path: Union[str, pd.DataFrame] = metadata_path

def create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Creates a matrix: features (Proteins) as columns, samples as rows."""

df = self.rawinput
df = df.set_index(self.index_column)

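# intensity_column is either a single pattern containing the "[sample]"
# placeholder (expanded to a regex selecting one intensity column per sample)
# or an explicit list of intensity column names.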
if isinstance(self.intensity_column, str):
regex_find_intensity_columns = self.intensity_column.replace(
"[sample]", ".*"
)
df = df.filter(regex=regex_find_intensity_columns, axis=1)
# remove Intensity so only sample names remain
substring_to_remove = regex_find_intensity_columns.replace(".*", "")
df.columns = df.columns.str.replace(substring_to_remove, "")

else:
df = df[self.intensity_column]

rawmat = df.transpose()
rawmat.replace([np.inf, -np.inf], np.nan, inplace=True)

# remove proteins with only zero # TODO this is re-done in preprocessing
mat_no_zeros = rawmat.loc[:, (rawmat != 0).any(axis=0)].astype(float)

self._check_matrix_values(mat_no_zeros)

return rawmat, mat_no_zeros

@staticmethod
def _check_matrix_values(mat: pd.DataFrame) -> None:
"""Check for infinite values in the matrix."""
if np.isinf(mat).values.sum() > 0:
logging.warning("Data contains infinite values.")

def create_metadata(self, mat: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
"""Create metadata DataFrame from metadata file or DataFrame."""

if self.metadata_path is not None:
sample = self.sample_column
metadata = self._load_metadata(file_path=self.metadata_path)
metadata = self._remove_missing_samples_from_metadata(mat, metadata, sample)
else:
sample = "sample"
metadata = pd.DataFrame({"sample": list(mat.index)})

return metadata, sample

def _remove_missing_samples_from_metadata(
self, mat: pd.DataFrame, metadata: pd.DataFrame, sample: str
) -> pd.DataFrame:
"""Remove samples from metadata that are not in the protein data."""
samples_matrix = mat.index.to_list()
samples_metadata = metadata[sample].to_list()
misc_samples = list(set(samples_metadata) - set(samples_matrix))
if len(misc_samples) > 0:
metadata = metadata[~metadata[sample].isin(misc_samples)]
logging.warning(
f"{misc_samples} are not described in the protein data and"
"are removed from the metadata."
)
return metadata

def _load_metadata(
self, file_path: Union[pd.DataFrame, str]
) -> Optional[pd.DataFrame]:
"""Load metadata either xlsx, txt, csv or txt file
Args:
file_path: path to metadata file or metadata DataFrame # TODO disentangle this
"""
if isinstance(file_path, pd.DataFrame):
df = file_path
elif file_path.endswith(".xlsx"):
warnings.filterwarnings(
"ignore",
category=UserWarning,
module="openpyxl",
# message=r"/extension is not supported and will be removed/", # this somehow does not work here?
)
df = pd.read_excel(file_path)
# find robust way to detect file format
# else give file separation as variable
elif file_path.endswith(".txt") or file_path.endswith(".tsv"):
df = pd.read_csv(file_path, delimiter="\t")
elif file_path.endswith(".csv"):
df = pd.read_csv(file_path)
else:
logging.warning(
"WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file"
)
return None

if df is not None and self.sample_column not in df.columns:
logging.error(
f"sample_column: {self.sample_column} not found in {file_path}"
)

# check whether sample labeling matches protein data
# warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
df.columns = df.columns.astype(str)
return df

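For context, a minimal sketch of how the new DataSetFactory could be used on its own, assuming an intensity-column pattern like "LFQ intensity [sample]"; the column names, sample names and data values below are made up for illustration, and the surrounding DataSet wiring is not part of this commit:

import pandas as pd

from alphastats.dataset_factory import DataSetFactory

# Hypothetical raw input: one row per protein group, one intensity column per sample.
rawinput = pd.DataFrame(
    {
        "Protein IDs": ["P1", "P2"],
        "LFQ intensity sampleA": [1.0, 0.0],
        "LFQ intensity sampleB": [2.0, 3.0],
    }
)

# Hypothetical metadata: must contain the sample column ("sample" here).
metadata_df = pd.DataFrame({"sample": ["sampleA", "sampleB"], "group": ["ctrl", "treat"]})

factory = DataSetFactory(
    rawinput=rawinput,
    index_column="Protein IDs",
    intensity_column="LFQ intensity [sample]",  # "[sample]" is expanded per sample
    metadata_path=metadata_df,  # a path to a .xlsx/.tsv/.csv/.txt file also works
    sample_column="sample",
)

# Samples as rows, proteins as columns; mat has all-zero features removed.
rawmat, mat = factory.create_matrix_from_rawinput()
metadata, sample_column = factory.create_metadata(mat)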