diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index 5a77dfa8..f1387c8e 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -52,6 +52,11 @@ def __init__( metadata_path (str, optional): path to metadata file. Defaults to None. sample_column (str, optional): column in metadata file indicating the sample IDs. Defaults to None. + Attributes of a DataSet instance: + DataSet().rawinput: Raw Protein data. + DataSet().mat: Processed data matrix with ProteinIDs/ProteinGroups as columns and samples as rows. All computations are performed on this matrix. + DataSet().metadata: Metadata for the samples in the matrix. Metadata will be matched with DataSet().mat when needed (for instance Volcano Plot). + """ self._check_loader(loader=loader) @@ -64,14 +69,14 @@ def __init__( self.gene_names: str = loader.gene_names # include filtering before - self.create_matrix() + self._create_matrix() self._check_matrix_values() self.metadata: pd.DataFrame self.sample: str if metadata_path is not None: self.sample = sample_column - self.metadata = self.load_metadata(file_path=metadata_path) + self.metadata = self._load_metadata(file_path=metadata_path) self._remove_misc_samples_in_metadata() else: self.sample = "sample" @@ -83,12 +88,18 @@ def __init__( ) self.intensity_column = intensity_column - # save preprocessing settings - self.preprocessing_info: Dict = self._save_dataset_info() + # init preprocessing settings + self.preprocessing_info: Dict = Preprocess.init_preprocessing_info( + num_samples=self.mat.shape[0], + num_protein_groups=self.mat.shape[1], + intensity_column=self.intensity_column, + filter_columns=self.filter_columns, + ) + + self.preprocessed = False self.preprocessed: bool = False print("DataSet has been created.") - self.overview() def preprocess( self, @@ -126,8 +137,16 @@ def preprocess( def reset_preprocessing(self): """Reset all preprocessing steps""" - self.create_matrix() - # TODO fix bug: metadata is not reset here + self._create_matrix() + self.preprocessing_info = Preprocess.init_preprocessing_info( + num_samples=self.mat.shape[0], + num_protein_groups=self.mat.shape[1], + intensity_column=self.intensity_column, + filter_columns=self.filter_columns, + ) + + self.preprocessed = False + # TODO fix bug: metadata is not reset/reloaded here print("All preprocessing steps are reset.") def batch_correction(self, batch: str) -> None: @@ -192,7 +211,7 @@ def _subset(self): ) return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())] - def create_matrix(self): + def _create_matrix(self): """ Creates a matrix of the Outputfile, with columns displaying features (Proteins) and rows the samples. @@ -216,17 +235,15 @@ def create_matrix(self): # transpose dataframe mat = df.transpose() mat.replace([np.inf, -np.inf], np.nan, inplace=True) + self.rawmat = mat # remove proteins with only zero # TODO this is re-done in preprocessing - self.mat = mat.loc[:, (mat != 0).any(axis=0)] - self.mat = self.mat.astype(float) - - # reset preproccessing info - self.preprocessing_info = self._save_dataset_info() - self.preprocessed = False - self.rawmat = mat + mat_no_zeros = mat.loc[:, (mat != 0).any(axis=0)] + self.mat = mat_no_zeros.astype(float) - def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame: + def _load_metadata( + self, file_path: Union[pd.DataFrame, str] + ) -> Optional[pd.DataFrame]: """Load metadata either xlsx, txt, csv or txt file Args: @@ -249,11 +266,11 @@ def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame: elif file_path.endswith(".csv"): df = pd.read_csv(file_path) else: - df = None logging.warning( "WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file" ) - return + return None + if df is not None and self.sample not in df.columns: logging.error(f"sample_column: {self.sample} not found in {file_path}") @@ -261,32 +278,3 @@ def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame: # warnings.warn("WARNING: Sample names do not match sample labelling in protein data") df.columns = df.columns.astype(str) return df - - def _save_dataset_info(self): - n_proteingroups = self.mat.shape[1] - preprocessing_dict = { - PreprocessingStateKeys.RAW_DATA_NUM_PG: n_proteingroups, - PreprocessingStateKeys.NUM_PG: self.mat.shape[1], - PreprocessingStateKeys.NUM_SAMPLES: self.mat.shape[0], - PreprocessingStateKeys.INTENSITY_COLUMN: self.intensity_column, - PreprocessingStateKeys.LOG2_TRANSFORMED: False, - PreprocessingStateKeys.NORMALIZATION: None, - PreprocessingStateKeys.IMPUTATION: None, - PreprocessingStateKeys.CONTAMINATIONS_REMOVED: False, - PreprocessingStateKeys.CONTAMINATION_COLUMNS: self.filter_columns, - PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: 0, - PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0, - PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0, - PreprocessingStateKeys.MISSING_VALUES_REMOVED: False, - } - return preprocessing_dict - - def overview(self): - """Print overview of the DataSet""" - dataset_overview = ( - "Attributes of the DataSet can be accessed using: \n" - + "DataSet.rawinput:\t Raw Protein data.\n" - + "DataSet.mat:\t\tProcessed data matrix with ProteinIDs/ProteinGroups as columns and samples as rows. All computations are performed on this matrix.\n" - + "DataSet.metadata:\tMetadata for the samples in the matrix. Metadata will be matched with DataSet.mat when needed (for instance Volcano Plot)." - ) - print(dataset_overview) diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/DataSet_Preprocess.py index 107b1745..f1e21ddd 100644 --- a/alphastats/DataSet_Preprocess.py +++ b/alphastats/DataSet_Preprocess.py @@ -59,6 +59,30 @@ def __init__( self.preprocessing_info = preprocessing_info # changed self.mat = mat # changed + @staticmethod + def init_preprocessing_info( + num_samples: int, + num_protein_groups: int, + intensity_column: str, + filter_columns: List[str], + ) -> Dict: + """Initialize preprocessing info.""" + return { + PreprocessingStateKeys.RAW_DATA_NUM_PG: num_protein_groups, + PreprocessingStateKeys.NUM_PG: num_protein_groups, + PreprocessingStateKeys.NUM_SAMPLES: num_samples, + PreprocessingStateKeys.INTENSITY_COLUMN: intensity_column, + PreprocessingStateKeys.LOG2_TRANSFORMED: False, + PreprocessingStateKeys.NORMALIZATION: None, + PreprocessingStateKeys.IMPUTATION: None, + PreprocessingStateKeys.CONTAMINATIONS_REMOVED: False, + PreprocessingStateKeys.CONTAMINATION_COLUMNS: filter_columns, + PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: 0, + PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0, + PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0, + PreprocessingStateKeys.MISSING_VALUES_REMOVED: False, + } + def _remove_samples(self, sample_list: list): # exclude samples for analysis self.mat = self.mat.drop(sample_list) diff --git a/alphastats/gui/utils/preprocessing_helper.py b/alphastats/gui/utils/preprocessing_helper.py index da9e53d7..69e621ad 100644 --- a/alphastats/gui/utils/preprocessing_helper.py +++ b/alphastats/gui/utils/preprocessing_helper.py @@ -285,7 +285,6 @@ def display_preprocessing_info(preprocessing_info): ) -# TODO: cache this def reset_preprocessing(dataset: DataSet) -> None: """Reset the preprocessing of the dataset. @@ -296,5 +295,5 @@ def reset_preprocessing(dataset: DataSet) -> None: None """ - dataset.create_matrix() + dataset.reset_preprocessing() st.info("Preprocessing has been reset.") diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py index 127ae94d..5a75a07d 100644 --- a/tests/test_DataSet.py +++ b/tests/test_DataSet.py @@ -80,14 +80,14 @@ def test_load_metadata_missing_sample_column(self, mock): # is error raised when name of sample column is missing path = self.metadata_path self.obj.sample = "wrong_sample_column" - self.obj.load_metadata(file_path=path) + self.obj._load_metadata(file_path=path) mock.assert_called_once() @patch("logging.Logger.warning") def test_load_metadata_warning(self, mock): # is dataframe None and is warning produced file_path = "wrong/file.xxx" - self.obj.load_metadata(file_path=file_path) + self.obj._load_metadata(file_path=file_path) mock.assert_called_once() def test_create_matrix(self): @@ -221,15 +221,15 @@ def test_dataset_without_metadata(self): def test_load_metadata_fileformats(self): # test if different fileformats get loaded correctly metadata_path = "testfiles/alphapept/metadata.txt" - self.obj.load_metadata(file_path=metadata_path) + self.obj._load_metadata(file_path=metadata_path) self.assertEqual(self.obj.metadata.shape, (2, 2)) metadata_path = "testfiles/alphapept/metadata.tsv" - self.obj.load_metadata(file_path=metadata_path) + self.obj._load_metadata(file_path=metadata_path) self.assertEqual(self.obj.metadata.shape, (2, 2)) metadata_path = "testfiles/alphapept/metadata.csv" - self.obj.load_metadata(file_path=metadata_path) + self.obj._load_metadata(file_path=metadata_path) self.assertEqual(self.obj.metadata.shape, (2, 2)) @patch("logging.Logger.warning")