diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py
index df31e350..184219e7 100644
--- a/alphastats/DataSet.py
+++ b/alphastats/DataSet.py
@@ -4,13 +4,13 @@
 import plotly
 import scipy
 
-from alphastats import BaseLoader
 from alphastats.dataset_factory import DataSetFactory
 from alphastats.dataset_harmonizer import DataHarmonizer
 from alphastats.DataSet_Plot import Plot
 from alphastats.DataSet_Preprocess import Preprocess
 from alphastats.DataSet_Statistics import Statistics
 from alphastats.keys import Cols
+from alphastats.loader.BaseLoader import BaseLoader
 from alphastats.plots.ClusterMap import ClusterMap
 from alphastats.plots.DimensionalityReduction import DimensionalityReduction
 from alphastats.plots.IntensityPlot import IntensityPlot
@@ -167,10 +167,12 @@ def preprocess(
         log2_transform: bool = False,
         remove_contaminations: bool = False,
         subset: bool = False,
+        replace_zeroes: bool = False,
         data_completeness: float = 0,
         normalization: str = None,
         imputation: str = None,
         remove_samples: list = None,
+        drop_unmeasured_features: bool = False,
         **kwargs,
     ) -> None:
         """A wrapper for Preprocess.preprocess(), see documentation there."""
@@ -179,10 +181,12 @@ def preprocess(
                 log2_transform,
                 remove_contaminations,
                 subset,
+                replace_zeroes,
                 data_completeness,
                 normalization,
                 imputation,
                 remove_samples,
+                drop_unmeasured_features,
                 **kwargs,
             )
         )
diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/DataSet_Preprocess.py
index acc014f8..9d92a4b0 100644
--- a/alphastats/DataSet_Preprocess.py
+++ b/alphastats/DataSet_Preprocess.py
@@ -23,6 +23,7 @@ class PreprocessingStateKeys(metaclass=ConstantsClass):
     NUM_PG = "Matrix= Number of ProteinIDs/ProteinGroups"
     NUM_SAMPLES = "Matrix= Number of samples"
     INTENSITY_COLUMN = "Intensity used for analysis"
+    REPLACE_ZEROES = "Replace zero values with nan"
     LOG2_TRANSFORMED = "Log2-transformed"
     NORMALIZATION = "Normalization"
     IMPUTATION = "Imputation"
@@ -36,6 +37,7 @@ class PreprocessingStateKeys(metaclass=ConstantsClass):
         "Number of removed ProteinGroups due to data completeness cutoff"
     )
     MISSING_VALUES_REMOVED = "Missing values were removed"
+    DROP_UNMEASURED_FEATURES = "Drop unmeasured features"
 
 
 class Preprocess:
@@ -72,6 +74,7 @@ def init_preprocessing_info(
             PreprocessingStateKeys.NUM_PG: num_protein_groups,
             PreprocessingStateKeys.NUM_SAMPLES: num_samples,
             PreprocessingStateKeys.INTENSITY_COLUMN: intensity_column,
+            PreprocessingStateKeys.REPLACE_ZEROES: False,
             PreprocessingStateKeys.LOG2_TRANSFORMED: False,
             PreprocessingStateKeys.NORMALIZATION: None,
             PreprocessingStateKeys.IMPUTATION: None,
@@ -81,6 +84,7 @@ def init_preprocessing_info(
             PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0,
             PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0,
             PreprocessingStateKeys.MISSING_VALUES_REMOVED: False,
+            PreprocessingStateKeys.DROP_UNMEASURED_FEATURES: False,
         }
 
     def _remove_samples(self, sample_list: list):
@@ -117,7 +121,6 @@ def _remove_na_values(self, cut_off):
         num_samples, num_proteins = self.mat.shape
         limit = num_samples * cut
 
-        self.mat.replace(0, np.nan, inplace=True)
         keep_list = list()
         invalid = 0
         for column_name in self.mat.columns:
@@ -331,6 +334,8 @@ def _normalization(self, method: str) -> None:
 
     def _log2_transform(self):
         self.mat = np.log2(self.mat)
+        self.mat = self.mat.replace([np.inf, -np.inf], np.nan)
+        # TODO: Ideally we wouldn't need to replace infs if all downstream methods can handle them
         self.preprocessing_info.update({PreprocessingStateKeys.LOG2_TRANSFORMED: True})
         print("Data has been log2-transformed.")
 
@@ -362,10 +367,12 @@ def preprocess(
         log2_transform: bool = False,
         remove_contaminations: bool = False,
         subset: bool = False,
+        replace_zeroes: bool = False,
         data_completeness: float = 0,
         normalization: str = None,
         imputation: str = None,
         remove_samples: list = None,
+        drop_unmeasured_features: bool = False,
         **kwargs,
     ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
         """Preprocess Protein data
@@ -427,6 +434,14 @@ def preprocess(
         if subset:
             self.mat = self.subset(self.mat, self.metadata, self.preprocessing_info)
 
+        if replace_zeroes:
+            self.mat = self.mat.replace(0, np.nan)
+            self.preprocessing_info.update(
+                {
+                    PreprocessingStateKeys.REPLACE_ZEROES: True,
+                }
+            )
+
         if data_completeness > 0:
             self._remove_na_values(cut_off=data_completeness)
 
@@ -444,9 +459,15 @@ def preprocess(
         if imputation is not None:
             self._imputation(method=imputation)
 
-        # TODO should this step be optional, too? is also done in create_matrix
-        # for now, add it to `preprocessing_info`
-        self.mat = self.mat.loc[:, (self.mat != 0).any(axis=0)]
+        if drop_unmeasured_features:
+            n = self.mat.shape[1]
+            self.mat = self.mat.loc[:, np.isfinite(self.mat).any(axis=0)]
+            self.preprocessing_info.update(
+                {
+                    PreprocessingStateKeys.DROP_UNMEASURED_FEATURES: n
+                    - self.mat.shape[1],
+                }
+            )
 
         self.preprocessing_info.update(
             {
diff --git a/alphastats/dataset_factory.py b/alphastats/dataset_factory.py
index de049cf4..607f04c2 100644
--- a/alphastats/dataset_factory.py
+++ b/alphastats/dataset_factory.py
@@ -46,12 +46,9 @@ def create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
         rawmat = df.transpose()
         rawmat.replace([np.inf, -np.inf], np.nan, inplace=True)
 
-        # remove proteins with only zero  # TODO this is re-done in preprocessing
-        mat_no_zeros = rawmat.loc[:, (rawmat != 0).any(axis=0)].astype(float)
+        self._check_matrix_values(rawmat)
 
-        self._check_matrix_values(mat_no_zeros)
-
-        return rawmat, mat_no_zeros
+        return rawmat, rawmat.astype(float)  # float copy; do not alias rawmat
 
     @staticmethod
     def _check_matrix_values(mat: pd.DataFrame) -> None:
diff --git a/alphastats/dataset_harmonizer.py b/alphastats/dataset_harmonizer.py
index a1f0a9ae..9c416690 100644
--- a/alphastats/dataset_harmonizer.py
+++ b/alphastats/dataset_harmonizer.py
@@ -4,8 +4,8 @@
 
 import pandas as pd
 
-from alphastats import BaseLoader
 from alphastats.keys import Cols
+from alphastats.loader.BaseLoader import BaseLoader
 
 
 class DataHarmonizer:
diff --git a/alphastats/gui/utils/overview_helper.py b/alphastats/gui/utils/overview_helper.py
index 12e55820..3a689e81 100644
--- a/alphastats/gui/utils/overview_helper.py
+++ b/alphastats/gui/utils/overview_helper.py
@@ -42,8 +42,8 @@ def display_loaded_dataset(dataset: DataSet) -> None:
     st.markdown("*Preview:* Matrix")
 
     df = pd.DataFrame(
-        dataset.mat.values,
-        index=dataset.mat.index.to_list(),
+        dataset.rawmat.values,
+        index=dataset.rawmat.index.to_list(),
     ).head(5)
 
     st.dataframe(df)
diff --git a/alphastats/gui/utils/preprocessing_helper.py b/alphastats/gui/utils/preprocessing_helper.py
index d583a483..22ecedfe 100644
--- a/alphastats/gui/utils/preprocessing_helper.py
+++ b/alphastats/gui/utils/preprocessing_helper.py
@@ -55,10 +55,12 @@ class PREPROCESSING_STEPS:
     REMOVE_CONTAMINATIONS = "remove_contaminations"
     REMOVE_SAMPLES = "remove_samples"
     SUBSET = "subset"
+    REPLACE_ZEROES = "replace_zeroes"
     DATA_COMPLETENESS = "data_completeness"
     LOG2_TRANSFORM = "log2_transform"
     NORMALIZATION = "normalization"
     IMPUTATION = "imputation"
+    DROP_UNMEASURED_FEATURES = "drop_unmeasured_features"
     BATCH = "batch"
 
 
@@ -77,6 +79,10 @@ class PREPROCESSING_STEPS:
         "repr": "Subset data",
         "help": "Subset data so it matches with metadata. Can for example be useful if several dimensions of an experiment were analysed together.",
     },
+    PREPROCESSING_STEPS.REPLACE_ZEROES: {
+        "repr": "0 --> NaN",
+        "help": "Replace 0 in the data with NaN.",
+    },
     PREPROCESSING_STEPS.DATA_COMPLETENESS: {
         "repr": "Filter data completeness",
         "help": "Filter data based on completeness across samples. E.g. if a protein has to be detected in at least 70% of the samples.",
@@ -93,6 +99,10 @@ class PREPROCESSING_STEPS:
         "repr": "Imputation",
         "help": 'Impute missing values using one of the available methods ("mean", "median", "knn", "randomforest").',
     },
+    PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES: {
+        "repr": "Drop empty proteins",
+        "help": "Drop unmeasured features (protein groups), i.e. ones that are all NaNs or Infs.",
+    },
     PREPROCESSING_STEPS.BATCH: {
         "repr": "Batch correction",
         "help": "Batch correction.",
@@ -103,10 +113,12 @@ class PREPROCESSING_STEPS:
     PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS,
     PREPROCESSING_STEPS.REMOVE_SAMPLES,
     PREPROCESSING_STEPS.SUBSET,
+    PREPROCESSING_STEPS.REPLACE_ZEROES,
     PREPROCESSING_STEPS.DATA_COMPLETENESS,
     PREPROCESSING_STEPS.LOG2_TRANSFORM,
     PREPROCESSING_STEPS.NORMALIZATION,
     PREPROCESSING_STEPS.IMPUTATION,
+    PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES,
     PREPROCESSING_STEPS.BATCH,
 ]
 
@@ -178,14 +190,14 @@ def configure_preprocessing(dataset):
         + "[documentation](https://alphapeptstats.readthedocs.io/en/main/data_preprocessing.html)."
     )
 
-    remove_contaminations = st.selectbox(
+    remove_contaminations = st.checkbox(
         f"Remove contaminations annotated in {dataset.filter_columns}",
-        options=[True, False],
+        value=True,
     )
 
-    subset = st.selectbox(
+    subset = st.checkbox(
         "Subset data so it matches with metadata. Remove miscellanous samples in rawinput.",
-        options=[True, False],
+        value=False,
     )
 
     # TODO: value of this widget does not persist across dataset reset (likely because the metadata is reset)
@@ -195,6 +207,11 @@ def configure_preprocessing(dataset):
     )
     remove_samples = remove_samples if len(remove_samples) != 0 else None
 
+    replace_zeroes = st.checkbox(
+        "Replace 0 in the data with NaN.",
+        value=True,
+    )
+
     data_completeness = st.number_input(
         "Data completeness across samples cut-off \n(0.7 -> protein has to be detected in at least 70% of the samples)",
         value=0.0,
@@ -203,9 +220,9 @@ def configure_preprocessing(dataset):
         step=0.01,
     )
 
-    log2_transform = st.selectbox(
-        "Log2-transform dataset",
-        options=[True, False],
+    log2_transform = st.checkbox(
+        "Log2-transform dataset. Note: If this is skipped it will be performed on the fly for select analyses (e.g. Volcano plot).",
+        value=True,
+    )
 
     normalization = st.selectbox(
@@ -216,6 +233,11 @@ def configure_preprocessing(dataset):
         "Imputation", options=[None, "mean", "median", "knn", "randomforest"]
     )
 
+    drop_unmeasured_features = st.checkbox(
+        "Drop unmeasured features (protein groups), i.e. ones that are all NaNs or Infs.",
+        value=True,
+    )
+
     batch = st.selectbox(
         "Batch",
         options=[False] + dataset.metadata.columns.to_list(),
@@ -225,10 +247,12 @@ def configure_preprocessing(dataset):
         PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS: remove_contaminations,
         PREPROCESSING_STEPS.REMOVE_SAMPLES: remove_samples,
         PREPROCESSING_STEPS.SUBSET: subset,
+        PREPROCESSING_STEPS.REPLACE_ZEROES: replace_zeroes,
         PREPROCESSING_STEPS.DATA_COMPLETENESS: data_completeness,
         PREPROCESSING_STEPS.LOG2_TRANSFORM: log2_transform,
         PREPROCESSING_STEPS.NORMALIZATION: normalization,
         PREPROCESSING_STEPS.IMPUTATION: imputation,
+        PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES: drop_unmeasured_features,
         PREPROCESSING_STEPS.BATCH: batch,
     }
 
diff --git a/alphastats/gui/utils/ui_helper.py b/alphastats/gui/utils/ui_helper.py
index 3b1ea738..5a09820b 100644
--- a/alphastats/gui/utils/ui_helper.py
+++ b/alphastats/gui/utils/ui_helper.py
@@ -107,7 +107,9 @@ def init_session_state() -> None:
         st.session_state[StateKeys.WORKFLOW] = [
             PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS,
             PREPROCESSING_STEPS.SUBSET,
+            PREPROCESSING_STEPS.REPLACE_ZEROES,
             PREPROCESSING_STEPS.LOG2_TRANSFORM,
+            PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES,
         ]
 
     if StateKeys.ANALYSIS_LIST not in st.session_state:
diff --git a/alphastats/plots/VolcanoPlot.py b/alphastats/plots/VolcanoPlot.py
index 7c7ac787..14b15ad5 100644
--- a/alphastats/plots/VolcanoPlot.py
+++ b/alphastats/plots/VolcanoPlot.py
@@ -235,7 +235,10 @@ def _annotate_result_df(self):
         add color labels for up and down regulates
         """
         self.res = self.res[(self.res["log2fc"] < 20) & (self.res["log2fc"] > -20)]
-        self.res["-log10(p-value)"] = -np.log10(self.res[self.pvalue_column])
+        # TODO: this is a bit hacky, but is necessary due to the masked p-values after automatic filtering. Look for a better solution where the p-values are calculated
+        self.res["-log10(p-value)"] = [
+            -np.log10(el) for el in self.res[self.pvalue_column]
+        ]
 
         self.alpha = -np.log10(self.alpha)
         # add color variable to plot
diff --git a/alphastats/statistics/DifferentialExpressionAnalysis.py b/alphastats/statistics/DifferentialExpressionAnalysis.py
index 6464b5ef..aff27088 100644
--- a/alphastats/statistics/DifferentialExpressionAnalysis.py
+++ b/alphastats/statistics/DifferentialExpressionAnalysis.py
@@ -144,6 +144,17 @@ def _welch_ttest(self) -> pd.DataFrame:
         return df
 
     def _generic_ttest(self, test_fun: Callable) -> pd.DataFrame:
+        """
+        Perform a t-test between two groups, assuming log-normally distributed data.
+
+        If the data was not already log2-transformed during preprocessing, it is log2-transformed here, so the t-test is always performed on log2-transformed data.
+
+        Parameters:
+            test_fun (Callable): A function that performs a t-test and accepts a `nan_policy` keyword, e.g. scipy.stats.ttest_ind or scipy.stats.ttest_rel
+
+        Returns:
+            pd.DataFrame: DataFrame with index_column, p-value and log2 fold change.
+        """
         group1_samples = self.metadata[self.metadata[self.column] == self.group1][
             Cols.SAMPLE
         ].tolist()
@@ -153,10 +164,16 @@ def _generic_ttest(self, test_fun: Callable) -> pd.DataFrame:
         # calculate fold change (if its is not logarithimic normalized)
         mat_transpose = self.mat.transpose()
 
+        if not self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]:
+            mat_transpose = mat_transpose.transform(lambda x: np.log2(x))
+            mat_transpose = mat_transpose.replace([np.inf, -np.inf], np.nan)
+
+        # TODO: return not only the p-value, but also the t-statistic
         p_values = mat_transpose.apply(
             lambda row: test_fun(
                 row[group1_samples].values.flatten(),
                 row[group2_samples].values.flatten(),
+                nan_policy="omit",
             )[1],
             axis=1,
         )
@@ -170,9 +187,7 @@ def _generic_ttest(self, test_fun: Callable) -> pd.DataFrame:
             mat_transpose=mat_transpose,
             group1_samples=group1_samples,
             group2_samples=group2_samples,
-            is_log2_transformed=self.preprocessing_info[
-                PreprocessingStateKeys.LOG2_TRANSFORMED
-            ],
+            is_log2_transformed=True,
         )
         return df
 
diff --git a/testfiles/synthetic/preprocessing_pentests.csv b/testfiles/synthetic/preprocessing_pentests.csv
new file mode 100644
index 00000000..311c0c00
--- /dev/null
+++ b/testfiles/synthetic/preprocessing_pentests.csv
@@ -0,0 +1,21 @@
+Protein IDs,Gene names,Intensity S1,Intensity S2,Intensity S3,Intensity S4
+P1,G1,39616,108153,132790,91671
+P2,G2,,,,181742
+P3,G3,141445,0,0,0
+P4,G4,24254,25612,149501,0
+P5,G5,127808,41498,,102773
+P6,G6,,,,
+P7,G7,0,0,0,0
+P8,G8,75078,24887,104354,31023
+P9,G9,42729,188031,109334,51308
+P10,G10,167145,89193,23843,20795
+P11;P11-2,G11,194209,121039,166156,124618
+P12;P21;P22,G12;G21;G22,54389,159559,57823,42809
+P13;P14-2,G13;G14,147080,178678,109757,54960
+P14;P15,G14;G15,160613,115275,102138,43485
+P15,,159979,108805,115492,69940
+P16,G16,127875,199145,106602,27661
+P17,G16,42913,141174,118502,120732
+P18,G18,122464,135518,43450,54453
+P19,G19,79095,61045,179863,133889
+P20,G20,114108,87382,89980,165134
diff --git a/testfiles/synthetic/preprocessing_pentests_metadata.csv b/testfiles/synthetic/preprocessing_pentests_metadata.csv
new file mode 100644
index 00000000..ba2be273
--- /dev/null
+++ b/testfiles/synthetic/preprocessing_pentests_metadata.csv
@@ -0,0 +1,5 @@
+sample,groups
+S1,1
+S2,1
+S3,2
+S4,2
diff --git a/tests/gui/test_04_preprocessing.py b/tests/gui/test_04_preprocessing.py
index b2e93905..05bd4578 100644
--- a/tests/gui/test_04_preprocessing.py
+++ b/tests/gui/test_04_preprocessing.py
@@ -24,7 +24,7 @@ def test_page_04_loads_with_input():
     at.run()
 
     assert not at.exception
-    assert at.columns[0].selectbox.len == 6
+    assert at.columns[0].selectbox.len == 3
     assert at.button.len == 2
 
 
diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py
index 3cd65ac5..70f2696a 100644
--- a/tests/test_DataSet.py
+++ b/tests/test_DataSet.py
@@ -385,14 +385,14 @@ class TestMaxQuantDataSet(BaseTestDataSet.BaseTest):
     def setUp(self):
         self.loader = MaxQuantLoader(file="testfiles/maxquant/proteinGroups.txt")
         self.metadata_path = "testfiles/maxquant/metadata.xlsx"
-        self.obj = DataSet(
+        self.obj: DataSet = DataSet(
             loader=self.loader,
             metadata_path_or_df=self.metadata_path,
             sample_column="sample",
         )
         # expected dimensions of matrix
-        self.matrix_dim = (312, 2596)
-        self.matrix_dim_filtered = (312, 2397)
+        self.matrix_dim = (312, 2611)
+        self.matrix_dim_filtered = (312, 2409)
         self.comparison_column = "disease"
 
     def test_load_evidence_wrong_sample_names(self):
@@ -413,7 +413,9 @@ def test_plot_pca_group(self):
         self.assertEqual(len(pca_plot.to_plotly_json().get("data")), 5)
 
     def test_data_completeness(self):
-        self.obj.preprocess(log2_transform=False, data_completeness=0.7)
+        self.obj.preprocess(
+            log2_transform=False, replace_zeroes=True, data_completeness=0.7
+        )
         self.assertEqual(self.obj.mat.shape[1], 159)
 
     def test_plot_pca_circles(self):
@@ -486,14 +488,15 @@ def test_plot_volcano_compare_preprocessing_modes_randomforest(self):
         self.assertEqual(len(result_list), 3)
 
     def test_preprocess_subset(self):
-        self.obj.preprocess(subset=True, log2_transform=False)
-        self.assertEqual(self.obj.mat.shape, (48, 1364))
+        self.obj.preprocess(subset=True)
+        self.assertEqual(self.obj.mat.shape[0], 48)
 
     @patch("alphastats.DataSet.DataSet.tukey_test")
     def test_anova_without_tukey(self, mock):
+        # TODO: Check why 4 extra rows are generated here. This is not due to changes made to 0 and nan filtering.
         anova_results = self.obj.anova(column="disease", protein_ids="all", tukey=False)
         self.assertEqual(anova_results["ANOVA_pvalue"][1], 0.4469688936240973)
-        self.assertEqual(anova_results.shape, (2600, 2))
+        self.assertEqual(anova_results.shape, (self.matrix_dim[1] + 4, 2))
         # check if tukey isnt called
         mock.assert_not_called()
 
@@ -518,7 +521,7 @@ def test_plot_intensity_subgroup_gracefully_handle_one_group(self):
 
     def test_anova_with_tukey(self):
         # with first 100 protein ids
-        self.obj.preprocess(imputation="mean")
+        self.obj.preprocess(data_completeness=0.05, imputation="mean")
         id_list = self.obj.mat.columns.tolist()[0:100]
         results = self.obj.anova(column="disease", protein_ids=id_list, tukey=True)
         self.assertEqual(results.shape, (100, 10))
@@ -555,7 +558,7 @@ def test_plot_volcano_with_labels(self):
             draw_line=False,
         )
         n_labels = len(plot.to_plotly_json().get("layout").get("annotations"))
-        self.assertTrue(n_labels > 20)
+        self.assertTrue(n_labels > 5)
 
     def test_plot_volcano_wald(self):
         """
@@ -624,7 +627,7 @@ def test_plot_volcano_with_labels_proteins(self):
             labels=True,
         )
         n_labels = len(plot.to_plotly_json().get("layout").get("annotations"))
-        self.assertEqual(n_labels, 20)
+        self.assertEqual(n_labels, 9)
 
     def test_plot_volcano_with_labels_proteins_welch_ttest(self):
         # remove gene names
@@ -748,10 +751,13 @@ def test_plot_samplehistograms(self):
         self.assertEqual(312, len(fig["data"]))
 
     def test_batch_correction(self):
-        self.obj.preprocess(subset=True, imputation="knn", normalization="linear")
+        self.obj.preprocess(
+            subset=True, replace_zeroes=True, data_completeness=0.1, imputation="knn"
+        )
         self.obj.batch_correction(batch="batch_artifical_added")
+        # TODO: check if batch correction worked, but not by np.isclose, as this will change whenever something else about preprocessing is changed
         first_value = self.obj.mat.values[0, 0]
-        self.assertTrue(np.isclose(2.624937690577153e-08, first_value))
+        self.assertTrue(np.isclose(150490495.32554176, first_value))
 
     # TODO this opens a plot in a browser window
     @skip  # TODO multicova_analysis is unused
@@ -915,8 +921,8 @@ def setUp(self):
             sample_column="analytical_sample external_id",
         )
         # expected dimensions of matrix
-        self.matrix_dim = (20, 6)
-        self.matrix_dim_filtered = (20, 6)
+        self.matrix_dim = (20, 10)
+        self.matrix_dim_filtered = (20, 10)
         self.comparison_column = "grouping1"
 
 
@@ -986,8 +992,8 @@ def setUp(self):
         self.loader = copy.deepcopy(self.cls_loader)
         self.metadata_path = copy.deepcopy(self.cls_metadata_path)
         self.obj = copy.deepcopy(self.cls_obj)
-        self.matrix_dim = (8, 5)
-        self.matrix_dim_filtered = (8, 5)
+        self.matrix_dim = (8, 10)
+        self.matrix_dim_filtered = (8, 10)
         self.comparison_column = "grouping1"
 
     @classmethod
@@ -996,5 +1002,63 @@ def tearDownClass(cls):
             shutil.rmtree("testfiles/fragpipe/__MACOSX")
 
 
+class TestSyntheticDataSet(BaseTestDataSet.BaseTest):
+    @classmethod
+    def setUpClass(cls):
+        cls.cls_loader = GenericLoader(
+            file="testfiles/synthetic/preprocessing_pentests.csv",
+            intensity_column="Intensity [sample]",
+            index_column="Protein IDs",
+        )
+        cls.cls_metadata_path = (
+            "testfiles/synthetic/preprocessing_pentests_metadata.csv"
+        )
+        cls.cls_obj = DataSet(
+            loader=cls.cls_loader,
+            metadata_path_or_df=cls.cls_metadata_path,
+            sample_column="sample",
+        )
+
+    def setUp(self):
+        self.loader = copy.deepcopy(self.cls_loader)
+        self.metadata_path = copy.deepcopy(self.cls_metadata_path)
+        self.obj = copy.deepcopy(self.cls_obj)
+        self.matrix_dim = (4, 20)  # 4 samples x 20 protein groups in the fixture
+        self.matrix_dim_filtered = (4, 20)
+        self.comparison_column = "groups"
+
+    def test_preprocess_do_nothing(self):
+        """No preprocessing: all 20 features and the 8 NaNs are kept."""
+        self.obj.preprocess()
+        self.assertEqual(self.obj.mat.shape, self.matrix_dim)
+        self.assertEqual(np.isnan(self.obj.mat.values.flatten()).sum(), 8)
+
+    def test_preprocess_drop_unmeasured_features(self):
+        """Drop the one all-NaN feature (P6); the all-zero feature (P7) stays."""
+        self.obj.preprocess(drop_unmeasured_features=True)
+        self.assertEqual(self.obj.mat.shape[1], 19)
+        self.assertEqual(
+            self.obj.preprocessing_info[
+                PreprocessingStateKeys.DROP_UNMEASURED_FEATURES
+            ],
+            1,
+        )
+
+    def test_preprocess_replace_zero(self):
+        """Zeros become NaN; all-empty P6 and P7 are dropped, leaving 8 NaNs."""
+        self.obj.preprocess(replace_zeroes=True, drop_unmeasured_features=True)
+        self.assertEqual(self.obj.mat.shape[1], 18)
+        self.assertEqual(np.isnan(self.obj.mat.values.flatten()).sum(), 8)
+        self.assertEqual(
+            self.obj.preprocessing_info[
+                PreprocessingStateKeys.DROP_UNMEASURED_FEATURES
+            ],
+            2,
+        )
+        self.assertEqual(
+            self.obj.preprocessing_info[PreprocessingStateKeys.REPLACE_ZEROES], True
+        )
+
+
 if __name__ == "__main__":
     unittest.main()