From 842f41833ff5e17f0007ea1b5f210cc388417582 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 18 Sep 2024 17:47:04 +0200 Subject: [PATCH] reorder and simplify methods in DataSet constructor --- alphastats/DataSet.py | 86 +++++++++++++++++--------------- tests/gui/test_02_import_data.py | 4 +- tests/test_DataSet.py | 4 +- 3 files changed, 49 insertions(+), 45 deletions(-) diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index 21715fcb..112a7a35 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -1,4 +1,4 @@ -from typing import List, Union, Dict, Optional +from typing import List, Union, Dict, Optional, Tuple import pandas as pd import numpy as np @@ -66,44 +66,49 @@ def __init__( """ self._check_loader(loader=loader) + # fill data from loader self.rawinput: pd.DataFrame = loader.rawinput - self.software: str = loader.software - self.index_column: str = loader.index_column - self.intensity_column: Union[str, list] = loader.intensity_column self.filter_columns: List[str] = loader.filter_columns - self.evidence_df: pd.DataFrame = loader.evidence_df - self.gene_names: str = loader.gene_names + self.index_column: str = loader.index_column + self.software: str = loader.software + self._gene_names: str = loader.gene_names + # TODO this is used when creating the matrix, but then overwritten for Generic loaders later? 
+ self._intensity_column: Union[str, list] = loader.intensity_column - # include filtering before - self._create_matrix() - self._check_matrix_values() + # self.evidence_df: pd.DataFrame = loader.evidence_df # TODO unused + # create matrix + self.rawmat: pd.DataFrame + self.mat: pd.DataFrame + self.rawmat, self.mat = self._create_matrix_from_rawinput() + self._check_matrix_values(self.mat) + + # create metadata self.metadata: pd.DataFrame self.sample: str if metadata_path is not None: self.sample = sample_column - self.metadata = self._load_metadata(file_path=metadata_path) - self._remove_misc_samples_in_metadata() + metadata = self._load_metadata(file_path=metadata_path) + self.metadata = self._remove_misc_samples_in_metadata(metadata) else: self.sample = "sample" self.metadata = pd.DataFrame({"sample": list(self.mat.index)}) if loader == "Generic": - intensity_column = loader._extract_sample_names( + self._intensity_column = loader._extract_sample_names( metadata=self.metadata, sample_column=self.sample ) - self.intensity_column = intensity_column - # init preprocessing settings self.preprocessing_info: Dict = Preprocess.init_preprocessing_info( num_samples=self.mat.shape[0], num_protein_groups=self.mat.shape[1], - intensity_column=self.intensity_column, + intensity_column=self._intensity_column, filter_columns=self.filter_columns, ) - self.preprocessed = False - self.preprocessed: bool = False + self._preprocessed: bool = ( + False # TODO could be moved to preprocessing_info dict + ) print("DataSet has been created.") @@ -143,19 +148,19 @@ def preprocess( **kwargs, ) ) - self.preprocessed = True + self._preprocessed = True def reset_preprocessing(self): """Reset all preprocessing steps""" - self._create_matrix() + self.rawmat, self.mat = self._create_matrix_from_rawinput() self.preprocessing_info = Preprocess.init_preprocessing_info( num_samples=self.mat.shape[0], num_protein_groups=self.mat.shape[1], - intensity_column=self.intensity_column, + 
intensity_column=self._intensity_column, filter_columns=self.filter_columns, ) - self.preprocessed = False + self._preprocessed = False # TODO fix bug: metadata is not reset/reloaded here print("All preprocessing steps are reset.") @@ -340,7 +345,7 @@ def plot_volcano( metadata=self.metadata, sample=self.sample, index_column=self.index_column, - gene_names=self.gene_names, + gene_names=self._gene_names, preprocessing_info=self.preprocessing_info, group1=group1, group2=group2, @@ -392,7 +397,7 @@ def plot_intensity( mat=self.mat, metadata=self.metadata, sample=self.sample, - intensity_column=self.intensity_column, + intensity_column=self._intensity_column, preprocessing_info=self.preprocessing_info, protein_id=protein_id, group=group, @@ -500,24 +505,24 @@ def _check_loader(self, loader): "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader" ) - def _check_matrix_values(self): - if np.isinf(self.mat).values.sum() > 0: + @staticmethod + def _check_matrix_values(mat: pd.DataFrame) -> None: + if np.isinf(mat).values.sum() > 0: logging.warning("Data contains infinite values.") - def _remove_misc_samples_in_metadata(self): + def _remove_misc_samples_in_metadata(self, metadata: pd.DataFrame) -> pd.DataFrame: samples_matrix = self.mat.index.to_list() - samples_metadata = self.metadata[self.sample].to_list() + samples_metadata = metadata[self.sample].to_list() misc_samples = list(set(samples_metadata) - set(samples_matrix)) if len(misc_samples) > 0: - self.metadata = self.metadata[ - ~self.metadata[self.sample].isin(misc_samples) - ] + metadata = metadata[~metadata[self.sample].isin(misc_samples)] logging.warning( f"{misc_samples} are not described in the protein data and" "are removed from the metadata." 
) + return metadata - def _create_matrix(self): + def _create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Creates a matrix of the Outputfile, with columns displaying features (Proteins) and rows the samples. @@ -526,26 +531,25 @@ def _create_matrix(self): df = self.rawinput df = df.set_index(self.index_column) - if isinstance(self.intensity_column, str): - regex_find_intensity_columns = self.intensity_column.replace( + if isinstance(self._intensity_column, str): + regex_find_intensity_columns = self._intensity_column.replace( "[sample]", ".*" ) - df = df.filter(regex=(regex_find_intensity_columns), axis=1) + df = df.filter(regex=regex_find_intensity_columns, axis=1) # remove Intensity so only sample names remain substring_to_remove = regex_find_intensity_columns.replace(".*", "") df.columns = df.columns.str.replace(substring_to_remove, "") else: - df = df[self.intensity_column] + df = df[self._intensity_column] - # transpose dataframe - mat = df.transpose() - mat.replace([np.inf, -np.inf], np.nan, inplace=True) - self.rawmat = mat + rawmat = df.transpose() + rawmat.replace([np.inf, -np.inf], np.nan, inplace=True) # remove proteins with only zero # TODO this is re-done in preprocessing - mat_no_zeros = mat.loc[:, (mat != 0).any(axis=0)] - self.mat = mat_no_zeros.astype(float) + mat_no_zeros = rawmat.loc[:, (rawmat != 0).any(axis=0)] + + return rawmat, mat_no_zeros.astype(float) def _load_metadata( self, file_path: Union[pd.DataFrame, str] diff --git a/tests/gui/test_02_import_data.py b/tests/gui/test_02_import_data.py index a1f85cc4..1caef40c 100644 --- a/tests/gui/test_02_import_data.py +++ b/tests/gui/test_02_import_data.py @@ -137,9 +137,9 @@ def test_page_02_loads_maxquant_testfiles( assert not at.exception dataset = at.session_state.dataset - assert dataset.gene_names == "Gene names" + assert dataset._gene_names == "Gene names" assert dataset.index_column == "Protein IDs" - assert dataset.intensity_column == "LFQ intensity [sample]" + 
assert dataset._intensity_column == "LFQ intensity [sample]" assert dataset.rawmat.shape == (312, 2611) assert dataset.software == "MaxQuant" assert dataset.sample == "sample" diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py index fdfa5ffe..73156c99 100644 --- a/tests/test_DataSet.py +++ b/tests/test_DataSet.py @@ -107,8 +107,8 @@ def test_check_values_warning(self, mock): "B": [23, 22, 24, 22, 25], "C": [66, 72, np.inf, 68, -np.inf], } - self.obj.mat = pd.DataFrame(data) - self.obj._check_matrix_values() + mat = pd.DataFrame(data) + self.obj._check_matrix_values(mat) mock.assert_called_once() @patch("logging.Logger.info")