Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

0 value handling #345

Merged
merged 23 commits into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
0bb6202
the t-test now assumes log-normally distributed data and does log-tra…
JuliaS92 Sep 25, 2024
8c4839f
Add 0 replacement as explicit preprocessing step and remove it from d…
JuliaS92 Sep 25, 2024
828a8a5
remove all nan instead of all 0 rows
JuliaS92 Sep 25, 2024
4b60d43
repr of new step
JuliaS92 Sep 25, 2024
481a1df
Add 0 replacement to wrapper code
JuliaS92 Sep 25, 2024
2987636
fix data completeness test
JuliaS92 Sep 25, 2024
c50ec46
Make dropping of unmeasured features robust, optional and transparent …
JuliaS92 Sep 27, 2024
ae84d61
Fix subset test to only look at sample dimension
JuliaS92 Sep 27, 2024
9d5503c
Fix batch correction test with major todo, removed normalization step…
JuliaS92 Sep 27, 2024
96fdfc4
Add optional step to interface
JuliaS92 Sep 27, 2024
598f54e
Fix test
JuliaS92 Sep 27, 2024
e75ec8d
Add pentest data and tests for 0 replacement and removal of unmeasure…
JuliaS92 Nov 18, 2024
2660336
Add missing state key, test for state keys, fix tests after removing …
JuliaS92 Nov 18, 2024
646a4f5
Add note on on-the-fly log transformation
JuliaS92 Nov 18, 2024
d88fc99
Add todo on p-values and rename to replace_zeroes
JuliaS92 Nov 18, 2024
3a87eea
expand docu
JuliaS92 Nov 18, 2024
fdb4c77
Merge branch 'development' into 0-value-handling
mschwoer Nov 18, 2024
83f477b
fix merge conflicts
mschwoer Nov 18, 2024
bac1277
fix tests
mschwoer Nov 18, 2024
8357126
Fix Baseloader imports
JuliaS92 Nov 18, 2024
831c325
Show raw matrix on overview
JuliaS92 Nov 18, 2024
e96242d
Handle infs during on the fly log transformation
JuliaS92 Nov 18, 2024
fccd60f
Changed results after 0 handling
JuliaS92 Nov 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,12 @@ def preprocess(
log2_transform: bool = False,
remove_contaminations: bool = False,
subset: bool = False,
replace_zero: bool = False,
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
data_completeness: float = 0,
normalization: str = None,
imputation: str = None,
remove_samples: list = None,
drop_unmeasured_features: bool = False,
**kwargs,
) -> None:
"""A wrapper for Preprocess.preprocess(), see documentation there."""
Expand All @@ -167,10 +169,12 @@ def preprocess(
log2_transform,
remove_contaminations,
subset,
replace_zero,
data_completeness,
normalization,
imputation,
remove_samples,
drop_unmeasured_features,
**kwargs,
)
)
Expand Down
19 changes: 15 additions & 4 deletions alphastats/DataSet_Preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class PreprocessingStateKeys:
"Number of removed ProteinGroups due to data completeness cutoff"
)
MISSING_VALUES_REMOVED = "Missing values were removed"
DROP_UNMEASURED_FEATURES = "Drop unmeasured features"


class Preprocess:
Expand Down Expand Up @@ -84,6 +85,7 @@ def init_preprocessing_info(
PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0,
PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0,
PreprocessingStateKeys.MISSING_VALUES_REMOVED: False,
PreprocessingStateKeys.DROP_UNMEASURED_FEATURES: False,
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
}

def _remove_samples(self, sample_list: list):
Expand Down Expand Up @@ -120,7 +122,6 @@ def _remove_na_values(self, cut_off):
num_samples, num_proteins = self.mat.shape
limit = num_samples * cut

self.mat.replace(0, np.nan, inplace=True)
keep_list = list()
invalid = 0
for column_name in self.mat.columns:
Expand Down Expand Up @@ -334,6 +335,7 @@ def _normalization(self, method: str) -> None:

def _log2_transform(self):
self.mat = np.log2(self.mat)
self.mat = self.mat.replace([np.inf, -np.inf], np.nan)
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
self.preprocessing_info.update({PreprocessingStateKeys.LOG2_TRANSFORMED: True})
print("Data has been log2-transformed.")

Expand Down Expand Up @@ -365,10 +367,12 @@ def preprocess(
log2_transform: bool = False,
remove_contaminations: bool = False,
subset: bool = False,
replace_zero: bool = False,
data_completeness: float = 0,
normalization: str = None,
imputation: str = None,
remove_samples: list = None,
drop_unmeasured_features: bool = False,
**kwargs,
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
"""Preprocess Protein data
Expand Down Expand Up @@ -430,6 +434,9 @@ def preprocess(
self.mat, self.metadata, self.sample, self.preprocessing_info
)

if replace_zero:
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
self.mat = self.mat.replace(0, np.nan)

if data_completeness > 0:
self._remove_na_values(cut_off=data_completeness)

Expand All @@ -447,9 +454,13 @@ def preprocess(
if imputation is not None:
self._imputation(method=imputation)

# TODO should this step be optional, too? is also done in create_matrix
# for now, add it to `preprocessing_info`
self.mat = self.mat.loc[:, (self.mat != 0).any(axis=0)]
self.preprocessing_info.update(
{
PreprocessingStateKeys.DROP_UNMEASURED_FEATURES: drop_unmeasured_features,
}
)
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
if drop_unmeasured_features:
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
self.mat = self.mat.loc[:, np.isfinite(self.mat).any(axis=0)]

self.preprocessing_info.update(
{
Expand Down
2 changes: 2 additions & 0 deletions alphastats/gui/pages/03_Preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
st.session_state[StateKeys.WORKFLOW] = [
PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS,
PREPROCESSING_STEPS.SUBSET,
PREPROCESSING_STEPS.REPLACE_ZERO,
PREPROCESSING_STEPS.LOG2_TRANSFORM,
PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES,
]

st.markdown("### Preprocessing")
Expand Down
36 changes: 30 additions & 6 deletions alphastats/gui/utils/preprocessing_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,12 @@ class PREPROCESSING_STEPS:
REMOVE_CONTAMINATIONS = "remove_contaminations"
REMOVE_SAMPLES = "remove_samples"
SUBSET = "subset"
REPLACE_ZERO = "replace_zero"
DATA_COMPLETENESS = "data_completeness"
LOG2_TRANSFORM = "log2_transform"
NORMALIZATION = "normalization"
IMPUTATION = "imputation"
DROP_UNMEASURED_FEATURES = "drop_unmeasured_features"
BATCH = "batch"


Expand All @@ -76,6 +78,10 @@ class PREPROCESSING_STEPS:
"repr": "Subset data",
"help": "Subset data so it matches with metadata. Can for example be useful if several dimensions of an experiment were analysed together.",
},
PREPROCESSING_STEPS.REPLACE_ZERO: {
"repr": "0 --> NaN",
"help": "Replace 0 in the data with NaN.",
},
PREPROCESSING_STEPS.DATA_COMPLETENESS: {
"repr": "Filter data completeness",
"help": "Filter data based on completeness across samples. E.g. if a protein has to be detected in at least 70% of the samples.",
Expand All @@ -92,6 +98,10 @@ class PREPROCESSING_STEPS:
"repr": "Imputation",
"help": 'Impute missing values using one of the available methods ("mean", "median", "knn", "randomforest").',
},
PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES: {
"repr": "Drop empty proteins",
"help": "Drop unmeasured features (protein groups).",
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
},
PREPROCESSING_STEPS.BATCH: {
"repr": "Batch correction",
"help": "Batch correction.",
Expand All @@ -102,10 +112,12 @@ class PREPROCESSING_STEPS:
PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS,
PREPROCESSING_STEPS.REMOVE_SAMPLES,
PREPROCESSING_STEPS.SUBSET,
PREPROCESSING_STEPS.REPLACE_ZERO,
PREPROCESSING_STEPS.DATA_COMPLETENESS,
PREPROCESSING_STEPS.LOG2_TRANSFORM,
PREPROCESSING_STEPS.NORMALIZATION,
PREPROCESSING_STEPS.IMPUTATION,
PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES,
PREPROCESSING_STEPS.BATCH,
]

Expand Down Expand Up @@ -177,14 +189,14 @@ def configure_preprocessing(dataset):
+ "[documentation](https://alphapeptstats.readthedocs.io/en/main/data_preprocessing.html)."
)

remove_contaminations = st.selectbox(
remove_contaminations = st.checkbox(
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
f"Remove contaminations annotated in {dataset.filter_columns}",
options=[True, False],
value=True,
)

subset = st.selectbox(
subset = st.checkbox(
"Subset data so it matches with metadata. Remove miscellanous samples in rawinput.",
options=[True, False],
value=False,
)

# TODO: value of this widget does not persist across dataset reset (likely because the metadata is reset)
Expand All @@ -194,6 +206,11 @@ def configure_preprocessing(dataset):
)
remove_samples = remove_samples if len(remove_samples) != 0 else None

replace_zero = st.checkbox(
"Replace 0 in the data with NaN.",
value=True,
)

data_completeness = st.number_input(
f"Data completeness across samples cut-off \n(0.7 -> protein has to be detected in at least 70% of the samples)",
value=0.0,
Expand All @@ -202,9 +219,9 @@ def configure_preprocessing(dataset):
step=0.01,
)

log2_transform = st.selectbox(
log2_transform = st.checkbox(
"Log2-transform dataset",
options=[True, False],
value=True,
)

normalization = st.selectbox(
Expand All @@ -215,6 +232,11 @@ def configure_preprocessing(dataset):
"Imputation", options=[None, "mean", "median", "knn", "randomforest"]
)

drop_unmeasured_features = st.checkbox(
"Drop unmeasured features (protein groups)",
value=True,
)

batch = st.selectbox(
"Batch",
options=[False] + dataset.metadata.columns.to_list(),
Expand All @@ -224,10 +246,12 @@ def configure_preprocessing(dataset):
PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS: remove_contaminations,
PREPROCESSING_STEPS.REMOVE_SAMPLES: remove_samples,
PREPROCESSING_STEPS.SUBSET: subset,
PREPROCESSING_STEPS.REPLACE_ZERO: replace_zero,
PREPROCESSING_STEPS.DATA_COMPLETENESS: data_completeness,
PREPROCESSING_STEPS.LOG2_TRANSFORM: log2_transform,
PREPROCESSING_STEPS.NORMALIZATION: normalization,
PREPROCESSING_STEPS.IMPUTATION: imputation,
PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES: drop_unmeasured_features,
PREPROCESSING_STEPS.BATCH: batch,
}

Expand Down
4 changes: 3 additions & 1 deletion alphastats/plots/VolcanoPlot.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,9 @@ def _annotate_result_df(self):
add color labels for up and down regulates
"""
self.res = self.res[(self.res["log2fc"] < 20) & (self.res["log2fc"] > -20)]
self.res["-log10(p-value)"] = -np.log10(self.res[self.pvalue_column])
self.res["-log10(p-value)"] = [
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
-np.log10(el) for el in self.res[self.pvalue_column]
]

self.alpha = -np.log10(self.alpha)
# add color variable to plot
Expand Down
25 changes: 17 additions & 8 deletions alphastats/statistics/DifferentialExpressionAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,14 @@ def _welch_ttest(self) -> pd.DataFrame:
return df

def _ttest(self) -> pd.DataFrame:
"""
Perform a t-test between two groups, assuming log-normally distributed data.

If the data was not already log transformed during preprocessing, it will be log2 transformed here.
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved

Returns:
pd.DataFrame: DataFrame with index_column, p-value and log2 fold change.
"""
group1_samples = self.metadata[self.metadata[self.column] == self.group1][
self.sample
].tolist()
Expand All @@ -179,15 +187,20 @@ def _ttest(self) -> pd.DataFrame:
# calculate fold change (if its is not logarithimic normalized)
mat_transpose = self.mat.transpose()

if not self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]:
mat_transpose = mat_transpose.transform(lambda x: np.log2(x))
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved

# TODO: return not only the p-value, but also the t-statistic
p_values = mat_transpose.apply(
lambda row: scipy.stats.ttest_ind(
row[group1_samples].values.flatten(),
row[group2_samples].values.flatten(),
nan_policy="omit",
)[1],
axis=1,
)

fc = self._calculate_foldchange(
fc = self._calculate_logfoldchange(
mat_transpose=mat_transpose,
group1_samples=group1_samples,
group2_samples=group2_samples,
Expand Down Expand Up @@ -218,7 +231,7 @@ def _pairedttest(self) -> pd.DataFrame:
axis=1,
)

fc = self._calculate_foldchange(
fc = self._calculate_logfoldchange(
mat_transpose=mat_transpose,
group1_samples=group1_samples,
group2_samples=group2_samples,
Expand All @@ -231,17 +244,13 @@ def _pairedttest(self) -> pd.DataFrame:
df["log2fc"] = fc
return df

def _calculate_foldchange( # TODO duplicated
def _calculate_logfoldchange( # TODO duplicated
self, mat_transpose: pd.DataFrame, group1_samples: list, group2_samples: list
):
group1_values = mat_transpose[group1_samples].T.mean().values
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved
group2_values = mat_transpose[group2_samples].T.mean().values
if self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]:
fc = group1_values - group2_values

else:
fc = group1_values / group2_values
fc = np.log2(fc)
fc = group1_values - group2_values

return fc

Expand Down
2 changes: 1 addition & 1 deletion tests/gui/test_04_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_page_04_loads_with_input():
at.run()

assert not at.exception
assert at.columns[3].selectbox.len == 6
assert at.columns[3].selectbox.len == 3
assert at.button.len == 2
JuliaS92 marked this conversation as resolved.
Show resolved Hide resolved


Expand Down
15 changes: 10 additions & 5 deletions tests/test_DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,9 @@ def test_plot_pca_group(self):
self.assertEqual(len(pca_plot.to_plotly_json().get("data")), 5)

def test_data_completeness(self):
self.obj.preprocess(log2_transform=False, data_completeness=0.7)
self.obj.preprocess(
log2_transform=False, replace_zero=True, data_completeness=0.7
)
self.assertEqual(self.obj.mat.shape[1], 159)

def test_plot_pca_circles(self):
Expand Down Expand Up @@ -502,8 +504,8 @@ def test_plot_volcano_compare_preprocessing_modes_randomforest(self):
self.assertEqual(len(result_list), 3)

def test_preprocess_subset(self):
self.obj.preprocess(subset=True, log2_transform=False)
self.assertEqual(self.obj.mat.shape, (48, 1364))
self.obj.preprocess(subset=True)
self.assertEqual(self.obj.mat.shape[0], 48)

@patch("alphastats.DataSet.DataSet.tukey_test")
def test_anova_without_tukey(self, mock):
Expand Down Expand Up @@ -765,10 +767,13 @@ def test_plot_samplehistograms(self):
self.assertEqual(312, len(fig["data"]))

def test_batch_correction(self):
self.obj.preprocess(subset=True, imputation="knn", normalization="linear")
self.obj.preprocess(
subset=True, replace_zero=True, data_completeness=0.1, imputation="knn"
)
self.obj.batch_correction(batch="batch_artifical_added")
# TODO: check if batch correction worked, but not by np.isclose, as this will change whenever soemthing else about preprocessing is changed
first_value = self.obj.mat.values[0, 0]
self.assertTrue(np.isclose(2.624937690577153e-08, first_value))
self.assertTrue(np.isclose(150490495.32554176, first_value))

# TODO this opens a plot in a browser window
@skip # TODO multicova_analysis is unused
Expand Down
Loading