Skip to content

Commit

Permalink
make some methods private, minor refactorings, get rid of overview()
Browse files Browse the repository at this point in the history
  • Loading branch information
mschwoer committed Sep 17, 2024
1 parent 27f055b commit b4b2346
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 25 deletions.
36 changes: 16 additions & 20 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ def __init__(
metadata_path (str, optional): path to metadata file. Defaults to None.
sample_column (str, optional): column in metadata file indicating the sample IDs. Defaults to None.
Attributes of a DataSet instance:
DataSet().rawinput: Raw Protein data.
DataSet().mat: Processed data matrix with ProteinIDs/ProteinGroups as columns and samples as rows. All computations are performed on this matrix.
DataSet().metadata: Metadata for the samples in the matrix. Metadata will be matched with DataSet().mat when needed (for instance Volcano Plot).
"""
self._check_loader(loader=loader)

Expand All @@ -64,14 +69,14 @@ def __init__(
self.gene_names: str = loader.gene_names

# include filtering before
self.create_matrix()
self._create_matrix()
self._check_matrix_values()

self.metadata: pd.DataFrame
self.sample: str
if metadata_path is not None:
self.sample = sample_column
self.metadata = self.load_metadata(file_path=metadata_path)
self.metadata = self._load_metadata(file_path=metadata_path)
self._remove_misc_samples_in_metadata()
else:
self.sample = "sample"
Expand All @@ -95,7 +100,6 @@ def __init__(
self.preprocessed: bool = False

print("DataSet has been created.")
self.overview()

def preprocess(
self,
Expand Down Expand Up @@ -133,7 +137,7 @@ def preprocess(

def reset_preprocessing(self):
"""Reset all preprocessing steps"""
self.create_matrix()
self._create_matrix()
self.preprocessing_info = Preprocess.init_preprocessing_info(
num_samples=self.mat.shape[0],
num_protein_groups=self.mat.shape[1],
Expand Down Expand Up @@ -207,7 +211,7 @@ def _subset(self):
)
return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())]

def create_matrix(self):
def _create_matrix(self):
"""
Creates a matrix of the Outputfile, with columns displaying features (Proteins) and
rows the samples.
Expand All @@ -233,12 +237,14 @@ def create_matrix(self):
mat.replace([np.inf, -np.inf], np.nan, inplace=True)

# remove proteins that contain only zeros across all samples  # TODO this is re-done in preprocessing
self.mat = mat.loc[:, (mat != 0).any(axis=0)]
self.mat = self.mat.astype(float)
mat = mat.loc[:, (mat != 0).any(axis=0)]

self.mat = mat.astype(float)
self.rawmat = mat

def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame:
def _load_metadata(
self, file_path: Union[pd.DataFrame, str]
) -> Optional[pd.DataFrame]:
"""Load metadata from either an xlsx, tsv, csv or txt file
Args:
Expand All @@ -261,25 +267,15 @@ def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame:
elif file_path.endswith(".csv"):
df = pd.read_csv(file_path)
else:
df = None
logging.warning(
"WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file"
)
return
return None

if df is not None and self.sample not in df.columns:
logging.error(f"sample_column: {self.sample} not found in {file_path}")

# check whether sample labeling matches protein data
# warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
df.columns = df.columns.astype(str)
return df

def overview(self):
"""Print overview of the DataSet"""
dataset_overview = (
"Attributes of the DataSet can be accessed using: \n"
+ "DataSet.rawinput:\t Raw Protein data.\n"
+ "DataSet.mat:\t\tProcessed data matrix with ProteinIDs/ProteinGroups as columns and samples as rows. All computations are performed on this matrix.\n"
+ "DataSet.metadata:\tMetadata for the samples in the matrix. Metadata will be matched with DataSet.mat when needed (for instance Volcano Plot)."
)
print(dataset_overview)
10 changes: 5 additions & 5 deletions tests/test_DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,14 @@ def test_load_metadata_missing_sample_column(self, mock):
# is an error logged when the sample column is not found in the metadata
path = self.metadata_path
self.obj.sample = "wrong_sample_column"
self.obj.load_metadata(file_path=path)
self.obj._load_metadata(file_path=path)
mock.assert_called_once()

@patch("logging.Logger.warning")
def test_load_metadata_warning(self, mock):
# is dataframe None and is warning produced
file_path = "wrong/file.xxx"
self.obj.load_metadata(file_path=file_path)
self.obj._load_metadata(file_path=file_path)
mock.assert_called_once()

def test_create_matrix(self):
Expand Down Expand Up @@ -221,15 +221,15 @@ def test_dataset_without_metadata(self):
def test_load_metadata_fileformats(self):
# test if different fileformats get loaded correctly
metadata_path = "testfiles/alphapept/metadata.txt"
self.obj.load_metadata(file_path=metadata_path)
self.obj._load_metadata(file_path=metadata_path)
self.assertEqual(self.obj.metadata.shape, (2, 2))

metadata_path = "testfiles/alphapept/metadata.tsv"
self.obj.load_metadata(file_path=metadata_path)
self.obj._load_metadata(file_path=metadata_path)
self.assertEqual(self.obj.metadata.shape, (2, 2))

metadata_path = "testfiles/alphapept/metadata.csv"
self.obj.load_metadata(file_path=metadata_path)
self.obj._load_metadata(file_path=metadata_path)
self.assertEqual(self.obj.metadata.shape, (2, 2))

@patch("logging.Logger.warning")
Expand Down

0 comments on commit b4b2346

Please sign in to comment.