diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py
index cacb743a..d9585b5e 100644
--- a/alphastats/DataSet.py
+++ b/alphastats/DataSet.py
@@ -52,6 +52,11 @@ def __init__(
             metadata_path (str, optional): path to metadata file. Defaults to None.
             sample_column (str, optional): column in metadata file indicating the sample IDs. Defaults to None.
 
+        Attributes of a DataSet instance:
+            DataSet().rawinput: Raw Protein data.
+            DataSet().mat: Processed data matrix with ProteinIDs/ProteinGroups as columns and samples as rows. All computations are performed on this matrix.
+            DataSet().metadata: Metadata for the samples in the matrix. Metadata will be matched with DataSet().mat when needed (for instance Volcano Plot).
+
         """
 
         self._check_loader(loader=loader)
@@ -64,14 +69,14 @@ def __init__(
         self.gene_names: str = loader.gene_names
 
         # include filtering before
-        self.create_matrix()
+        self._create_matrix()
         self._check_matrix_values()
 
         self.metadata: pd.DataFrame
         self.sample: str
         if metadata_path is not None:
             self.sample = sample_column
-            self.metadata = self.load_metadata(file_path=metadata_path)
+            self.metadata = self._load_metadata(file_path=metadata_path)
             self._remove_misc_samples_in_metadata()
         else:
             self.sample = "sample"
@@ -95,7 +100,6 @@ def __init__(
         self.preprocessed: bool = False
 
         print("DataSet has been created.")
-        self.overview()
 
     def preprocess(
         self,
@@ -133,7 +137,7 @@ def preprocess(
 
     def reset_preprocessing(self):
         """Reset all preprocessing steps"""
-        self.create_matrix()
+        self._create_matrix()
         self.preprocessing_info = Preprocess.init_preprocessing_info(
             num_samples=self.mat.shape[0],
             num_protein_groups=self.mat.shape[1],
@@ -207,7 +211,7 @@ def _subset(self):
         )
         return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())]
 
-    def create_matrix(self):
+    def _create_matrix(self):
         """
         Creates a matrix of the Outputfile, with columns displaying features (Proteins) and
         rows the samples.
@@ -233,12 +237,14 @@ def create_matrix(self):
         mat.replace([np.inf, -np.inf], np.nan, inplace=True)
         # remove proteins with only zero
         # TODO this is re-done in preprocessing
-        self.mat = mat.loc[:, (mat != 0).any(axis=0)]
-        self.mat = self.mat.astype(float)
+        mat = mat.loc[:, (mat != 0).any(axis=0)]
+        self.mat = mat.astype(float)
 
         self.rawmat = mat
 
-    def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame:
+    def _load_metadata(
+        self, file_path: Union[pd.DataFrame, str]
+    ) -> Optional[pd.DataFrame]:
         """Load metadata either xlsx, txt, csv or txt file
 
         Args:
@@ -261,11 +267,11 @@ def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame:
         elif file_path.endswith(".csv"):
             df = pd.read_csv(file_path)
         else:
-            df = None
             logging.warning(
                 "WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file"
             )
-            return
+            return None
+
         if df is not None and self.sample not in df.columns:
             logging.error(f"sample_column: {self.sample} not found in {file_path}")
 
@@ -273,13 +279,3 @@ def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame:
             # warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
         df.columns = df.columns.astype(str)
         return df
-
-    def overview(self):
-        """Print overview of the DataSet"""
-        dataset_overview = (
-            "Attributes of the DataSet can be accessed using: \n"
-            + "DataSet.rawinput:\t Raw Protein data.\n"
-            + "DataSet.mat:\t\tProcessed data matrix with ProteinIDs/ProteinGroups as columns and samples as rows. All computations are performed on this matrix.\n"
-            + "DataSet.metadata:\tMetadata for the samples in the matrix. Metadata will be matched with DataSet.mat when needed (for instance Volcano Plot)."
-        )
-        print(dataset_overview)
diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py
index 127ae94d..5a75a07d 100644
--- a/tests/test_DataSet.py
+++ b/tests/test_DataSet.py
@@ -80,14 +80,14 @@ def test_load_metadata_missing_sample_column(self, mock):
         # is error raised when name of sample column is missing
         path = self.metadata_path
         self.obj.sample = "wrong_sample_column"
-        self.obj.load_metadata(file_path=path)
+        self.obj._load_metadata(file_path=path)
         mock.assert_called_once()
 
     @patch("logging.Logger.warning")
     def test_load_metadata_warning(self, mock):
         # is dataframe None and is warning produced
         file_path = "wrong/file.xxx"
-        self.obj.load_metadata(file_path=file_path)
+        self.obj._load_metadata(file_path=file_path)
         mock.assert_called_once()
 
     def test_create_matrix(self):
@@ -221,15 +221,15 @@ def test_dataset_without_metadata(self):
     def test_load_metadata_fileformats(self):
         # test if different fileformats get loaded correctly
         metadata_path = "testfiles/alphapept/metadata.txt"
-        self.obj.load_metadata(file_path=metadata_path)
+        self.obj._load_metadata(file_path=metadata_path)
         self.assertEqual(self.obj.metadata.shape, (2, 2))
 
         metadata_path = "testfiles/alphapept/metadata.tsv"
-        self.obj.load_metadata(file_path=metadata_path)
+        self.obj._load_metadata(file_path=metadata_path)
         self.assertEqual(self.obj.metadata.shape, (2, 2))
 
         metadata_path = "testfiles/alphapept/metadata.csv"
-        self.obj.load_metadata(file_path=metadata_path)
+        self.obj._load_metadata(file_path=metadata_path)
         self.assertEqual(self.obj.metadata.shape, (2, 2))
 
     @patch("logging.Logger.warning")
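Usage note (not part of the patch): a minimal sketch of how a DataSet is used after this change, with the attributes now documented in the __init__ docstring accessed directly instead of via the removed overview() print. The package-root import path and the pre-built `loader` object are assumptions for illustration; only the constructor arguments and attribute names come from the diff above.

    # Hypothetical usage sketch; `loader` stands for an already-constructed alphastats loader
    # object, and the import path is an assumption rather than something this diff introduces.
    from alphastats import DataSet

    ds = DataSet(
        loader=loader,
        metadata_path="metadata.xlsx",  # .xlsx, .tsv, .csv or .txt are handled by _load_metadata()
        sample_column="sample",         # column in the metadata file holding the sample IDs
    )

    ds.rawinput   # raw protein data as loaded
    ds.mat        # processed matrix: samples as rows, ProteinIDs/ProteinGroups as columns
    ds.metadata   # sample metadata, matched against ds.mat when needed (e.g. volcano plot)

    # Note: ds.overview() no longer exists, and load_metadata()/create_matrix() are now
    # private (_load_metadata()/_create_matrix()), so external callers should not rely on them.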