Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

move initialization of preprocessing to Preprocess, change initializa… #325

Merged
merged 3 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 35 additions & 47 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ def __init__(
metadata_path (str, optional): path to metadata file. Defaults to None.
sample_column (str, optional): column in metadata file indicating the sample IDs. Defaults to None.

Attributes of a DataSet instance:
DataSet().rawinput: Raw Protein data.
DataSet().mat: Processed data matrix with ProteinIDs/ProteinGroups as columns and samples as rows. All computations are performed on this matrix.
DataSet().metadata: Metadata for the samples in the matrix. Metadata will be matched with DataSet().mat when needed (for instance Volcano Plot).

"""
self._check_loader(loader=loader)

Expand All @@ -64,14 +69,14 @@ def __init__(
self.gene_names: str = loader.gene_names

# include filtering before
self.create_matrix()
self._create_matrix()
self._check_matrix_values()

self.metadata: pd.DataFrame
self.sample: str
if metadata_path is not None:
self.sample = sample_column
self.metadata = self.load_metadata(file_path=metadata_path)
self.metadata = self._load_metadata(file_path=metadata_path)
self._remove_misc_samples_in_metadata()
else:
self.sample = "sample"
Expand All @@ -83,12 +88,18 @@ def __init__(
)
self.intensity_column = intensity_column

# save preprocessing settings
self.preprocessing_info: Dict = self._save_dataset_info()
# init preprocessing settings
self.preprocessing_info: Dict = Preprocess.init_preprocessing_info(
num_samples=self.mat.shape[0],
num_protein_groups=self.mat.shape[1],
intensity_column=self.intensity_column,
filter_columns=self.filter_columns,
)

self.preprocessed = False
self.preprocessed: bool = False

print("DataSet has been created.")
self.overview()

def preprocess(
self,
Expand Down Expand Up @@ -126,8 +137,16 @@ def preprocess(

def reset_preprocessing(self):
"""Reset all preprocessing steps"""
self.create_matrix()
# TODO fix bug: metadata is not reset here
self._create_matrix()
self.preprocessing_info = Preprocess.init_preprocessing_info(
num_samples=self.mat.shape[0],
num_protein_groups=self.mat.shape[1],
intensity_column=self.intensity_column,
filter_columns=self.filter_columns,
)

self.preprocessed = False
# TODO fix bug: metadata is not reset/reloaded here
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
print("All preprocessing steps are reset.")

def batch_correction(self, batch: str) -> None:
Expand Down Expand Up @@ -192,7 +211,7 @@ def _subset(self):
)
return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())]

def create_matrix(self):
def _create_matrix(self):
"""
Creates a matrix of the Outputfile, with columns displaying features (Proteins) and
rows the samples.
Expand All @@ -216,17 +235,15 @@ def create_matrix(self):
# transpose dataframe
mat = df.transpose()
mat.replace([np.inf, -np.inf], np.nan, inplace=True)
self.rawmat = mat

# remove proteins with only zero # TODO this is re-done in preprocessing
self.mat = mat.loc[:, (mat != 0).any(axis=0)]
self.mat = self.mat.astype(float)

# reset preproccessing info
self.preprocessing_info = self._save_dataset_info()
self.preprocessed = False
self.rawmat = mat
mat_no_zeros = mat.loc[:, (mat != 0).any(axis=0)]
self.mat = mat_no_zeros.astype(float)

def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame:
def _load_metadata(
self, file_path: Union[pd.DataFrame, str]
) -> Optional[pd.DataFrame]:
"""Load metadata either xlsx, txt, csv or txt file

Args:
Expand All @@ -249,44 +266,15 @@ def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame:
elif file_path.endswith(".csv"):
df = pd.read_csv(file_path)
else:
df = None
logging.warning(
"WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file"
)
return
return None

if df is not None and self.sample not in df.columns:
logging.error(f"sample_column: {self.sample} not found in {file_path}")

# check whether sample labeling matches protein data
# warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
df.columns = df.columns.astype(str)
return df

def _save_dataset_info(self):
    """Build the initial preprocessing-state dict for this dataset.

    Snapshots the current matrix dimensions plus the loader-provided
    intensity column and contamination-filter columns, and marks every
    preprocessing step as not yet applied (False / None / 0).

    Returns:
        Dict: mapping of ``PreprocessingStateKeys`` to their initial values.
    """
    num_samples, num_protein_groups = self.mat.shape
    return {
        # matrix dimensions at creation time (raw PG count == current PG count here)
        PreprocessingStateKeys.RAW_DATA_NUM_PG: num_protein_groups,
        PreprocessingStateKeys.NUM_PG: num_protein_groups,
        PreprocessingStateKeys.NUM_SAMPLES: num_samples,
        # configuration carried over from the loader
        PreprocessingStateKeys.INTENSITY_COLUMN: self.intensity_column,
        # no preprocessing has been applied yet
        PreprocessingStateKeys.LOG2_TRANSFORMED: False,
        PreprocessingStateKeys.NORMALIZATION: None,
        PreprocessingStateKeys.IMPUTATION: None,
        PreprocessingStateKeys.CONTAMINATIONS_REMOVED: False,
        PreprocessingStateKeys.CONTAMINATION_COLUMNS: self.filter_columns,
        PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: 0,
        PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0,
        PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0,
        PreprocessingStateKeys.MISSING_VALUES_REMOVED: False,
    }

def overview(self):
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
"""Print overview of the DataSet"""
dataset_overview = (
"Attributes of the DataSet can be accessed using: \n"
+ "DataSet.rawinput:\t Raw Protein data.\n"
+ "DataSet.mat:\t\tProcessed data matrix with ProteinIDs/ProteinGroups as columns and samples as rows. All computations are performed on this matrix.\n"
+ "DataSet.metadata:\tMetadata for the samples in the matrix. Metadata will be matched with DataSet.mat when needed (for instance Volcano Plot)."
)
print(dataset_overview)
24 changes: 24 additions & 0 deletions alphastats/DataSet_Preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,30 @@ def __init__(
self.preprocessing_info = preprocessing_info # changed
self.mat = mat # changed

@staticmethod
def init_preprocessing_info(
    num_samples: int,
    num_protein_groups: int,
    intensity_column: str,
    filter_columns: List[str],
) -> Dict:
    """Return a fresh preprocessing-info dict with every step marked as unapplied.

    Args:
        num_samples: number of samples (rows) in the data matrix.
        num_protein_groups: number of protein groups (columns) in the matrix.
        intensity_column: intensity column name/pattern used by the loader.
        filter_columns: columns used to flag contaminations.

    Returns:
        Dict: mapping of ``PreprocessingStateKeys`` to their initial values.
    """
    # Matrix dimensions at creation time; raw and current PG counts coincide.
    dimensions = {
        PreprocessingStateKeys.RAW_DATA_NUM_PG: num_protein_groups,
        PreprocessingStateKeys.NUM_PG: num_protein_groups,
        PreprocessingStateKeys.NUM_SAMPLES: num_samples,
        PreprocessingStateKeys.INTENSITY_COLUMN: intensity_column,
    }
    # Transformation/normalization/imputation: nothing applied yet.
    transform_state = {
        PreprocessingStateKeys.LOG2_TRANSFORMED: False,
        PreprocessingStateKeys.NORMALIZATION: None,
        PreprocessingStateKeys.IMPUTATION: None,
    }
    # Contamination filtering and completeness cutoff: untouched defaults.
    filter_state = {
        PreprocessingStateKeys.CONTAMINATIONS_REMOVED: False,
        PreprocessingStateKeys.CONTAMINATION_COLUMNS: filter_columns,
        PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: 0,
        PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0,
        PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0,
        PreprocessingStateKeys.MISSING_VALUES_REMOVED: False,
    }
    # Merge in the original key order so iteration/display order is unchanged.
    return {**dimensions, **transform_state, **filter_state}

def _remove_samples(self, sample_list: list):
# exclude samples for analysis
self.mat = self.mat.drop(sample_list)
Expand Down
3 changes: 1 addition & 2 deletions alphastats/gui/utils/preprocessing_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,6 @@ def display_preprocessing_info(preprocessing_info):
)


# TODO: cache this
def reset_preprocessing(dataset: DataSet) -> None:
"""Reset the preprocessing of the dataset.

Expand All @@ -296,5 +295,5 @@ def reset_preprocessing(dataset: DataSet) -> None:
None
"""

dataset.create_matrix()
dataset.reset_preprocessing()
st.info("Preprocessing has been reset.")
10 changes: 5 additions & 5 deletions tests/test_DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,14 @@ def test_load_metadata_missing_sample_column(self, mock):
# is error raised when name of sample column is missing
path = self.metadata_path
self.obj.sample = "wrong_sample_column"
self.obj.load_metadata(file_path=path)
self.obj._load_metadata(file_path=path)
mock.assert_called_once()

@patch("logging.Logger.warning")
def test_load_metadata_warning(self, mock):
# is dataframe None and is warning produced
file_path = "wrong/file.xxx"
self.obj.load_metadata(file_path=file_path)
self.obj._load_metadata(file_path=file_path)
mock.assert_called_once()

def test_create_matrix(self):
Expand Down Expand Up @@ -221,15 +221,15 @@ def test_dataset_without_metadata(self):
def test_load_metadata_fileformats(self):
# test if different fileformats get loaded correctly
metadata_path = "testfiles/alphapept/metadata.txt"
self.obj.load_metadata(file_path=metadata_path)
self.obj._load_metadata(file_path=metadata_path)
self.assertEqual(self.obj.metadata.shape, (2, 2))

metadata_path = "testfiles/alphapept/metadata.tsv"
self.obj.load_metadata(file_path=metadata_path)
self.obj._load_metadata(file_path=metadata_path)
self.assertEqual(self.obj.metadata.shape, (2, 2))

metadata_path = "testfiles/alphapept/metadata.csv"
self.obj.load_metadata(file_path=metadata_path)
self.obj._load_metadata(file_path=metadata_path)
self.assertEqual(self.obj.metadata.shape, (2, 2))

@patch("logging.Logger.warning")
Expand Down
Loading