Skip to content

Commit

Permalink
Merge pull request #345 from MannLabs/0-value-handling
Browse files Browse the repository at this point in the history
0 value handling
  • Loading branch information
mschwoer authored Nov 18, 2024
2 parents 5664401 + fccd60f commit f8f3ce2
Show file tree
Hide file tree
Showing 13 changed files with 197 additions and 41 deletions.
6 changes: 5 additions & 1 deletion alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import plotly
import scipy

from alphastats import BaseLoader
from alphastats.dataset_factory import DataSetFactory
from alphastats.dataset_harmonizer import DataHarmonizer
from alphastats.DataSet_Plot import Plot
from alphastats.DataSet_Preprocess import Preprocess
from alphastats.DataSet_Statistics import Statistics
from alphastats.keys import Cols
from alphastats.loader.BaseLoader import BaseLoader
from alphastats.plots.ClusterMap import ClusterMap
from alphastats.plots.DimensionalityReduction import DimensionalityReduction
from alphastats.plots.IntensityPlot import IntensityPlot
Expand Down Expand Up @@ -167,10 +167,12 @@ def preprocess(
log2_transform: bool = False,
remove_contaminations: bool = False,
subset: bool = False,
replace_zeroes: bool = False,
data_completeness: float = 0,
normalization: str = None,
imputation: str = None,
remove_samples: list = None,
drop_unmeasured_features: bool = False,
**kwargs,
) -> None:
"""A wrapper for Preprocess.preprocess(), see documentation there."""
Expand All @@ -179,10 +181,12 @@ def preprocess(
log2_transform,
remove_contaminations,
subset,
replace_zeroes,
data_completeness,
normalization,
imputation,
remove_samples,
drop_unmeasured_features,
**kwargs,
)
)
Expand Down
29 changes: 25 additions & 4 deletions alphastats/DataSet_Preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class PreprocessingStateKeys(metaclass=ConstantsClass):
NUM_PG = "Matrix= Number of ProteinIDs/ProteinGroups"
NUM_SAMPLES = "Matrix= Number of samples"
INTENSITY_COLUMN = "Intensity used for analysis"
REPLACE_ZEROES = "Replace zero values with nan"
LOG2_TRANSFORMED = "Log2-transformed"
NORMALIZATION = "Normalization"
IMPUTATION = "Imputation"
Expand All @@ -36,6 +37,7 @@ class PreprocessingStateKeys(metaclass=ConstantsClass):
"Number of removed ProteinGroups due to data completeness cutoff"
)
MISSING_VALUES_REMOVED = "Missing values were removed"
DROP_UNMEASURED_FEATURES = "Drop unmeasured features"


class Preprocess:
Expand Down Expand Up @@ -72,6 +74,7 @@ def init_preprocessing_info(
PreprocessingStateKeys.NUM_PG: num_protein_groups,
PreprocessingStateKeys.NUM_SAMPLES: num_samples,
PreprocessingStateKeys.INTENSITY_COLUMN: intensity_column,
PreprocessingStateKeys.REPLACE_ZEROES: False,
PreprocessingStateKeys.LOG2_TRANSFORMED: False,
PreprocessingStateKeys.NORMALIZATION: None,
PreprocessingStateKeys.IMPUTATION: None,
Expand All @@ -81,6 +84,7 @@ def init_preprocessing_info(
PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0,
PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0,
PreprocessingStateKeys.MISSING_VALUES_REMOVED: False,
PreprocessingStateKeys.DROP_UNMEASURED_FEATURES: False,
}

def _remove_samples(self, sample_list: list):
Expand Down Expand Up @@ -117,7 +121,6 @@ def _remove_na_values(self, cut_off):
num_samples, num_proteins = self.mat.shape
limit = num_samples * cut

self.mat.replace(0, np.nan, inplace=True)
keep_list = list()
invalid = 0
for column_name in self.mat.columns:
Expand Down Expand Up @@ -331,6 +334,8 @@ def _normalization(self, method: str) -> None:

def _log2_transform(self):
self.mat = np.log2(self.mat)
self.mat = self.mat.replace([np.inf, -np.inf], np.nan)
# TODO: Ideally we wouldn't need to replace infs if all downstream methods can handle them
self.preprocessing_info.update({PreprocessingStateKeys.LOG2_TRANSFORMED: True})
print("Data has been log2-transformed.")

Expand Down Expand Up @@ -362,10 +367,12 @@ def preprocess(
log2_transform: bool = False,
remove_contaminations: bool = False,
subset: bool = False,
replace_zeroes: bool = False,
data_completeness: float = 0,
normalization: str = None,
imputation: str = None,
remove_samples: list = None,
drop_unmeasured_features: bool = False,
**kwargs,
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
"""Preprocess Protein data
Expand Down Expand Up @@ -427,6 +434,14 @@ def preprocess(
if subset:
self.mat = self.subset(self.mat, self.metadata, self.preprocessing_info)

if replace_zeroes:
self.mat = self.mat.replace(0, np.nan)
self.preprocessing_info.update(
{
PreprocessingStateKeys.REPLACE_ZEROES: True,
}
)

if data_completeness > 0:
self._remove_na_values(cut_off=data_completeness)

Expand All @@ -444,9 +459,15 @@ def preprocess(
if imputation is not None:
self._imputation(method=imputation)

# TODO should this step be optional, too? is also done in create_matrix
# for now, add it to `preprocessing_info`
self.mat = self.mat.loc[:, (self.mat != 0).any(axis=0)]
if drop_unmeasured_features:
n = self.mat.shape[1]
self.mat = self.mat.loc[:, np.isfinite(self.mat).any(axis=0)]
self.preprocessing_info.update(
{
PreprocessingStateKeys.DROP_UNMEASURED_FEATURES: n
- self.mat.shape[1],
}
)

self.preprocessing_info.update(
{
Expand Down
7 changes: 2 additions & 5 deletions alphastats/dataset_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,9 @@ def create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
rawmat = df.transpose()
rawmat.replace([np.inf, -np.inf], np.nan, inplace=True)

# remove proteins with only zero # TODO this is re-done in preprocessing
mat_no_zeros = rawmat.loc[:, (rawmat != 0).any(axis=0)].astype(float)
self._check_matrix_values(rawmat)

self._check_matrix_values(mat_no_zeros)

return rawmat, mat_no_zeros
return rawmat, rawmat

@staticmethod
def _check_matrix_values(mat: pd.DataFrame) -> None:
Expand Down
2 changes: 1 addition & 1 deletion alphastats/dataset_harmonizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import pandas as pd

from alphastats import BaseLoader
from alphastats.keys import Cols
from alphastats.loader.BaseLoader import BaseLoader


class DataHarmonizer:
Expand Down
4 changes: 2 additions & 2 deletions alphastats/gui/utils/overview_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ def display_loaded_dataset(dataset: DataSet) -> None:
st.markdown("*Preview:* Matrix")

df = pd.DataFrame(
dataset.mat.values,
index=dataset.mat.index.to_list(),
dataset.rawmat.values,
index=dataset.rawmat.index.to_list(),
).head(5)

st.dataframe(df)
38 changes: 31 additions & 7 deletions alphastats/gui/utils/preprocessing_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,12 @@ class PREPROCESSING_STEPS:
REMOVE_CONTAMINATIONS = "remove_contaminations"
REMOVE_SAMPLES = "remove_samples"
SUBSET = "subset"
REPLACE_ZEROES = "replace_zeroes"
DATA_COMPLETENESS = "data_completeness"
LOG2_TRANSFORM = "log2_transform"
NORMALIZATION = "normalization"
IMPUTATION = "imputation"
DROP_UNMEASURED_FEATURES = "drop_unmeasured_features"
BATCH = "batch"


Expand All @@ -77,6 +79,10 @@ class PREPROCESSING_STEPS:
"repr": "Subset data",
"help": "Subset data so it matches with metadata. Can for example be useful if several dimensions of an experiment were analysed together.",
},
PREPROCESSING_STEPS.REPLACE_ZEROES: {
"repr": "0 --> NaN",
"help": "Replace 0 in the data with NaN.",
},
PREPROCESSING_STEPS.DATA_COMPLETENESS: {
"repr": "Filter data completeness",
"help": "Filter data based on completeness across samples. E.g. if a protein has to be detected in at least 70% of the samples.",
Expand All @@ -93,6 +99,10 @@ class PREPROCESSING_STEPS:
"repr": "Imputation",
"help": 'Impute missing values using one of the available methods ("mean", "median", "knn", "randomforest").',
},
PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES: {
"repr": "Drop empty proteins",
"help": "Drop unmeasured features (protein groups), i.e. ones that are all NaNs or Infs.",
},
PREPROCESSING_STEPS.BATCH: {
"repr": "Batch correction",
"help": "Batch correction.",
Expand All @@ -103,10 +113,12 @@ class PREPROCESSING_STEPS:
PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS,
PREPROCESSING_STEPS.REMOVE_SAMPLES,
PREPROCESSING_STEPS.SUBSET,
PREPROCESSING_STEPS.REPLACE_ZEROES,
PREPROCESSING_STEPS.DATA_COMPLETENESS,
PREPROCESSING_STEPS.LOG2_TRANSFORM,
PREPROCESSING_STEPS.NORMALIZATION,
PREPROCESSING_STEPS.IMPUTATION,
PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES,
PREPROCESSING_STEPS.BATCH,
]

Expand Down Expand Up @@ -178,14 +190,14 @@ def configure_preprocessing(dataset):
+ "[documentation](https://alphapeptstats.readthedocs.io/en/main/data_preprocessing.html)."
)

remove_contaminations = st.selectbox(
remove_contaminations = st.checkbox(
f"Remove contaminations annotated in {dataset.filter_columns}",
options=[True, False],
value=True,
)

subset = st.selectbox(
subset = st.checkbox(
"Subset data so it matches with metadata. Remove miscellanous samples in rawinput.",
options=[True, False],
value=False,
)

# TODO: value of this widget does not persist across dataset reset (likely because the metadata is reset)
Expand All @@ -195,6 +207,11 @@ def configure_preprocessing(dataset):
)
remove_samples = remove_samples if len(remove_samples) != 0 else None

replace_zeroes = st.checkbox(
"Replace 0 in the data with NaN.",
value=True,
)

data_completeness = st.number_input(
"Data completeness across samples cut-off \n(0.7 -> protein has to be detected in at least 70% of the samples)",
value=0.0,
Expand All @@ -203,9 +220,9 @@ def configure_preprocessing(dataset):
step=0.01,
)

log2_transform = st.selectbox(
"Log2-transform dataset",
options=[True, False],
log2_transform = st.checkbox(
"Log2-transform dataset. Note: If this is skipped it weill be performed on the fly for select analyses (e.g. Volcano plot).",
value=True,
)

normalization = st.selectbox(
Expand All @@ -216,6 +233,11 @@ def configure_preprocessing(dataset):
"Imputation", options=[None, "mean", "median", "knn", "randomforest"]
)

drop_unmeasured_features = st.checkbox(
"Drop unmeasured features (protein groups), i.e. ones that are all NaNs or Infs.",
value=True,
)

batch = st.selectbox(
"Batch",
options=[False] + dataset.metadata.columns.to_list(),
Expand All @@ -225,10 +247,12 @@ def configure_preprocessing(dataset):
PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS: remove_contaminations,
PREPROCESSING_STEPS.REMOVE_SAMPLES: remove_samples,
PREPROCESSING_STEPS.SUBSET: subset,
PREPROCESSING_STEPS.REPLACE_ZEROES: replace_zeroes,
PREPROCESSING_STEPS.DATA_COMPLETENESS: data_completeness,
PREPROCESSING_STEPS.LOG2_TRANSFORM: log2_transform,
PREPROCESSING_STEPS.NORMALIZATION: normalization,
PREPROCESSING_STEPS.IMPUTATION: imputation,
PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES: drop_unmeasured_features,
PREPROCESSING_STEPS.BATCH: batch,
}

Expand Down
2 changes: 2 additions & 0 deletions alphastats/gui/utils/ui_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ def init_session_state() -> None:
st.session_state[StateKeys.WORKFLOW] = [
PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS,
PREPROCESSING_STEPS.SUBSET,
PREPROCESSING_STEPS.REPLACE_ZEROES,
PREPROCESSING_STEPS.LOG2_TRANSFORM,
PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES,
]

if StateKeys.ANALYSIS_LIST not in st.session_state:
Expand Down
5 changes: 4 additions & 1 deletion alphastats/plots/VolcanoPlot.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,10 @@ def _annotate_result_df(self):
add color labels for up and down regulates
"""
self.res = self.res[(self.res["log2fc"] < 20) & (self.res["log2fc"] > -20)]
self.res["-log10(p-value)"] = -np.log10(self.res[self.pvalue_column])
# TODO: this is a bit hacky, but is necessary due to the masked p-values after automatic filtering. Look for a better solution where the p-values are calculated
self.res["-log10(p-value)"] = [
-np.log10(el) for el in self.res[self.pvalue_column]
]

self.alpha = -np.log10(self.alpha)
# add color variable to plot
Expand Down
21 changes: 18 additions & 3 deletions alphastats/statistics/DifferentialExpressionAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,17 @@ def _welch_ttest(self) -> pd.DataFrame:
return df

def _generic_ttest(self, test_fun: Callable) -> pd.DataFrame:
"""
Perform a t-test between two groups, assuming log-normally distributed data.
If the data was not already log2-transformed during preprocessing, it will be log2-transformed here, so the t-test is always performed on log2-transformed data.
Parameters:
test_fun (Callable): A function that performs a t-test, e.g. scipy.stats.ttest_ind or scipy.stats.ttest_rel
Returns:
pd.DataFrame: DataFrame with index_column, p-value and log2 fold change.
"""
group1_samples = self.metadata[self.metadata[self.column] == self.group1][
Cols.SAMPLE
].tolist()
Expand All @@ -153,10 +164,16 @@ def _generic_ttest(self, test_fun: Callable) -> pd.DataFrame:
# calculate fold change (if it is not logarithmically normalized)
mat_transpose = self.mat.transpose()

if not self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]:
mat_transpose = mat_transpose.transform(lambda x: np.log2(x))
mat_transpose = mat_transpose.replace([np.inf, -np.inf], np.nan)

# TODO: return not only the p-value, but also the t-statistic
p_values = mat_transpose.apply(
lambda row: test_fun(
row[group1_samples].values.flatten(),
row[group2_samples].values.flatten(),
nan_policy="omit",
)[1],
axis=1,
)
Expand All @@ -170,9 +187,7 @@ def _generic_ttest(self, test_fun: Callable) -> pd.DataFrame:
mat_transpose=mat_transpose,
group1_samples=group1_samples,
group2_samples=group2_samples,
is_log2_transformed=self.preprocessing_info[
PreprocessingStateKeys.LOG2_TRANSFORMED
],
is_log2_transformed=True,
)
return df

Expand Down
21 changes: 21 additions & 0 deletions testfiles/synthetic/preprocessing_pentests.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Protein IDs,Gene names,Intensity S1,Intensity S2,Intensity S3,Intensity S4
P1,G1,39616,108153,132790,91671
P2,G2,,,,181742
P3,G3,141445,0,0,0
P4,G4,24254,25612,149501,0
P5,G5,127808,41498,,102773
P6,G6,,,,
P7,G7,0,0,0,0
P8,G8,75078,24887,104354,31023
P9,G9,42729,188031,109334,51308
P10,G10,167145,89193,23843,20795
P11;P11-2,G11,194209,121039,166156,124618
P12;P21;P22,G12;G21;G22,54389,159559,57823,42809
P13;P14-2,G13;G14,147080,178678,109757,54960
P14;P15,G14;G15,160613,115275,102138,43485
P15,,159979,108805,115492,69940
P16,G16,127875,199145,106602,27661
P17,G16,42913,141174,118502,120732
P18,G18,122464,135518,43450,54453
P19,G19,79095,61045,179863,133889
P20,G20,114108,87382,89980,165134
5 changes: 5 additions & 0 deletions testfiles/synthetic/preprocessing_pentests_metadata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
sample,groups
S1,1
S2,1
S3,2
S4,2
2 changes: 1 addition & 1 deletion tests/gui/test_04_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def test_page_04_loads_with_input():
at.run()

assert not at.exception
assert at.columns[0].selectbox.len == 6
assert at.columns[0].selectbox.len == 3
assert at.button.len == 2


Expand Down
Loading

0 comments on commit f8f3ce2

Please sign in to comment.