diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index df31e350..184219e7 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -4,13 +4,13 @@ import plotly import scipy -from alphastats import BaseLoader from alphastats.dataset_factory import DataSetFactory from alphastats.dataset_harmonizer import DataHarmonizer from alphastats.DataSet_Plot import Plot from alphastats.DataSet_Preprocess import Preprocess from alphastats.DataSet_Statistics import Statistics from alphastats.keys import Cols +from alphastats.loader.BaseLoader import BaseLoader from alphastats.plots.ClusterMap import ClusterMap from alphastats.plots.DimensionalityReduction import DimensionalityReduction from alphastats.plots.IntensityPlot import IntensityPlot @@ -167,10 +167,12 @@ def preprocess( log2_transform: bool = False, remove_contaminations: bool = False, subset: bool = False, + replace_zeroes: bool = False, data_completeness: float = 0, normalization: str = None, imputation: str = None, remove_samples: list = None, + drop_unmeasured_features: bool = False, **kwargs, ) -> None: """A wrapper for Preprocess.preprocess(), see documentation there.""" @@ -179,10 +181,12 @@ def preprocess( log2_transform, remove_contaminations, subset, + replace_zeroes, data_completeness, normalization, imputation, remove_samples, + drop_unmeasured_features, **kwargs, ) ) diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/DataSet_Preprocess.py index acc014f8..9d92a4b0 100644 --- a/alphastats/DataSet_Preprocess.py +++ b/alphastats/DataSet_Preprocess.py @@ -23,6 +23,7 @@ class PreprocessingStateKeys(metaclass=ConstantsClass): NUM_PG = "Matrix= Number of ProteinIDs/ProteinGroups" NUM_SAMPLES = "Matrix= Number of samples" INTENSITY_COLUMN = "Intensity used for analysis" + REPLACE_ZEROES = "Replace zero values with nan" LOG2_TRANSFORMED = "Log2-transformed" NORMALIZATION = "Normalization" IMPUTATION = "Imputation" @@ -36,6 +37,7 @@ class PreprocessingStateKeys(metaclass=ConstantsClass): 
"Number of removed ProteinGroups due to data completeness cutoff" ) MISSING_VALUES_REMOVED = "Missing values were removed" + DROP_UNMEASURED_FEATURES = "Drop unmeasured features" class Preprocess: @@ -72,6 +74,7 @@ def init_preprocessing_info( PreprocessingStateKeys.NUM_PG: num_protein_groups, PreprocessingStateKeys.NUM_SAMPLES: num_samples, PreprocessingStateKeys.INTENSITY_COLUMN: intensity_column, + PreprocessingStateKeys.REPLACE_ZEROES: False, PreprocessingStateKeys.LOG2_TRANSFORMED: False, PreprocessingStateKeys.NORMALIZATION: None, PreprocessingStateKeys.IMPUTATION: None, @@ -81,6 +84,7 @@ def init_preprocessing_info( PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0, PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0, PreprocessingStateKeys.MISSING_VALUES_REMOVED: False, + PreprocessingStateKeys.DROP_UNMEASURED_FEATURES: False, } def _remove_samples(self, sample_list: list): @@ -117,7 +121,6 @@ def _remove_na_values(self, cut_off): num_samples, num_proteins = self.mat.shape limit = num_samples * cut - self.mat.replace(0, np.nan, inplace=True) keep_list = list() invalid = 0 for column_name in self.mat.columns: @@ -331,6 +334,8 @@ def _normalization(self, method: str) -> None: def _log2_transform(self): self.mat = np.log2(self.mat) + self.mat = self.mat.replace([np.inf, -np.inf], np.nan) + # TODO: Ideally we wouldn't need to replace infs if all downstream methods can handle them self.preprocessing_info.update({PreprocessingStateKeys.LOG2_TRANSFORMED: True}) print("Data has been log2-transformed.") @@ -362,10 +367,12 @@ def preprocess( log2_transform: bool = False, remove_contaminations: bool = False, subset: bool = False, + replace_zeroes: bool = False, data_completeness: float = 0, normalization: str = None, imputation: str = None, remove_samples: list = None, + drop_unmeasured_features: bool = False, **kwargs, ) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]: """Preprocess Protein data @@ -427,6 +434,14 @@ def preprocess( if subset: 
self.mat = self.subset(self.mat, self.metadata, self.preprocessing_info) + if replace_zeroes: + self.mat = self.mat.replace(0, np.nan) + self.preprocessing_info.update( + { + PreprocessingStateKeys.REPLACE_ZEROES: True, + } + ) + if data_completeness > 0: self._remove_na_values(cut_off=data_completeness) @@ -444,9 +459,15 @@ def preprocess( if imputation is not None: self._imputation(method=imputation) - # TODO should this step be optional, too? is also done in create_matrix - # for now, add it to `preprocessing_info` - self.mat = self.mat.loc[:, (self.mat != 0).any(axis=0)] + if drop_unmeasured_features: + n = self.mat.shape[1] + self.mat = self.mat.loc[:, np.isfinite(self.mat).any(axis=0)] + self.preprocessing_info.update( + { + PreprocessingStateKeys.DROP_UNMEASURED_FEATURES: n + - self.mat.shape[1], + } + ) self.preprocessing_info.update( { diff --git a/alphastats/dataset_factory.py b/alphastats/dataset_factory.py index de049cf4..607f04c2 100644 --- a/alphastats/dataset_factory.py +++ b/alphastats/dataset_factory.py @@ -46,12 +46,9 @@ def create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]: rawmat = df.transpose() rawmat.replace([np.inf, -np.inf], np.nan, inplace=True) - # remove proteins with only zero # TODO this is re-done in preprocessing - mat_no_zeros = rawmat.loc[:, (rawmat != 0).any(axis=0)].astype(float) + self._check_matrix_values(rawmat) - self._check_matrix_values(mat_no_zeros) - - return rawmat, mat_no_zeros + return rawmat, rawmat @staticmethod def _check_matrix_values(mat: pd.DataFrame) -> None: diff --git a/alphastats/dataset_harmonizer.py b/alphastats/dataset_harmonizer.py index a1f0a9ae..9c416690 100644 --- a/alphastats/dataset_harmonizer.py +++ b/alphastats/dataset_harmonizer.py @@ -4,8 +4,8 @@ import pandas as pd -from alphastats import BaseLoader from alphastats.keys import Cols +from alphastats.loader.BaseLoader import BaseLoader class DataHarmonizer: diff --git a/alphastats/gui/utils/overview_helper.py 
b/alphastats/gui/utils/overview_helper.py index 12e55820..3a689e81 100644 --- a/alphastats/gui/utils/overview_helper.py +++ b/alphastats/gui/utils/overview_helper.py @@ -42,8 +42,8 @@ def display_loaded_dataset(dataset: DataSet) -> None: st.markdown("*Preview:* Matrix") df = pd.DataFrame( - dataset.mat.values, - index=dataset.mat.index.to_list(), + dataset.rawmat.values, + index=dataset.rawmat.index.to_list(), ).head(5) st.dataframe(df) diff --git a/alphastats/gui/utils/preprocessing_helper.py b/alphastats/gui/utils/preprocessing_helper.py index d583a483..22ecedfe 100644 --- a/alphastats/gui/utils/preprocessing_helper.py +++ b/alphastats/gui/utils/preprocessing_helper.py @@ -55,10 +55,12 @@ class PREPROCESSING_STEPS: REMOVE_CONTAMINATIONS = "remove_contaminations" REMOVE_SAMPLES = "remove_samples" SUBSET = "subset" + REPLACE_ZEROES = "replace_zeroes" DATA_COMPLETENESS = "data_completeness" LOG2_TRANSFORM = "log2_transform" NORMALIZATION = "normalization" IMPUTATION = "imputation" + DROP_UNMEASURED_FEATURES = "drop_unmeasured_features" BATCH = "batch" @@ -77,6 +79,10 @@ class PREPROCESSING_STEPS: "repr": "Subset data", "help": "Subset data so it matches with metadata. Can for example be useful if several dimensions of an experiment were analysed together.", }, + PREPROCESSING_STEPS.REPLACE_ZEROES: { + "repr": "0 --> NaN", + "help": "Replace 0 in the data with NaN.", + }, PREPROCESSING_STEPS.DATA_COMPLETENESS: { "repr": "Filter data completeness", "help": "Filter data based on completeness across samples. E.g. if a protein has to be detected in at least 70% of the samples.", @@ -93,6 +99,10 @@ class PREPROCESSING_STEPS: "repr": "Imputation", "help": 'Impute missing values using one of the available methods ("mean", "median", "knn", "randomforest").', }, + PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES: { + "repr": "Drop empty proteins", + "help": "Drop unmeasured features (protein groups), i.e. 
ones that are all NaNs or Infs.", + }, PREPROCESSING_STEPS.BATCH: { "repr": "Batch correction", "help": "Batch correction.", @@ -103,10 +113,12 @@ class PREPROCESSING_STEPS: PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS, PREPROCESSING_STEPS.REMOVE_SAMPLES, PREPROCESSING_STEPS.SUBSET, + PREPROCESSING_STEPS.REPLACE_ZEROES, PREPROCESSING_STEPS.DATA_COMPLETENESS, PREPROCESSING_STEPS.LOG2_TRANSFORM, PREPROCESSING_STEPS.NORMALIZATION, PREPROCESSING_STEPS.IMPUTATION, + PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES, PREPROCESSING_STEPS.BATCH, ] @@ -178,14 +190,14 @@ def configure_preprocessing(dataset): + "[documentation](https://alphapeptstats.readthedocs.io/en/main/data_preprocessing.html)." ) - remove_contaminations = st.selectbox( + remove_contaminations = st.checkbox( f"Remove contaminations annotated in {dataset.filter_columns}", - options=[True, False], + value=True, ) - subset = st.selectbox( + subset = st.checkbox( "Subset data so it matches with metadata. Remove miscellanous samples in rawinput.", - options=[True, False], + value=False, ) # TODO: value of this widget does not persist across dataset reset (likely because the metadata is reset) @@ -195,6 +207,11 @@ def configure_preprocessing(dataset): ) remove_samples = remove_samples if len(remove_samples) != 0 else None + replace_zeroes = st.checkbox( + "Replace 0 in the data with NaN.", + value=True, + ) + data_completeness = st.number_input( "Data completeness across samples cut-off \n(0.7 -> protein has to be detected in at least 70% of the samples)", value=0.0, @@ -203,9 +220,9 @@ def configure_preprocessing(dataset): step=0.01, ) - log2_transform = st.selectbox( - "Log2-transform dataset", - options=[True, False], + log2_transform = st.checkbox( + "Log2-transform dataset. Note: If this is skipped it will be performed on the fly for select analyses (e.g. 
Volcano plot).", + value=True, ) normalization = st.selectbox( @@ -216,6 +233,11 @@ def configure_preprocessing(dataset): "Imputation", options=[None, "mean", "median", "knn", "randomforest"] ) + drop_unmeasured_features = st.checkbox( + "Drop unmeasured features (protein groups), i.e. ones that are all NaNs or Infs.", + value=True, + ) + batch = st.selectbox( "Batch", options=[False] + dataset.metadata.columns.to_list(), @@ -225,10 +247,12 @@ def configure_preprocessing(dataset): PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS: remove_contaminations, PREPROCESSING_STEPS.REMOVE_SAMPLES: remove_samples, PREPROCESSING_STEPS.SUBSET: subset, + PREPROCESSING_STEPS.REPLACE_ZEROES: replace_zeroes, PREPROCESSING_STEPS.DATA_COMPLETENESS: data_completeness, PREPROCESSING_STEPS.LOG2_TRANSFORM: log2_transform, PREPROCESSING_STEPS.NORMALIZATION: normalization, PREPROCESSING_STEPS.IMPUTATION: imputation, + PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES: drop_unmeasured_features, PREPROCESSING_STEPS.BATCH: batch, } diff --git a/alphastats/gui/utils/ui_helper.py b/alphastats/gui/utils/ui_helper.py index 3b1ea738..5a09820b 100644 --- a/alphastats/gui/utils/ui_helper.py +++ b/alphastats/gui/utils/ui_helper.py @@ -107,7 +107,9 @@ def init_session_state() -> None: st.session_state[StateKeys.WORKFLOW] = [ PREPROCESSING_STEPS.REMOVE_CONTAMINATIONS, PREPROCESSING_STEPS.SUBSET, + PREPROCESSING_STEPS.REPLACE_ZEROES, PREPROCESSING_STEPS.LOG2_TRANSFORM, + PREPROCESSING_STEPS.DROP_UNMEASURED_FEATURES, ] if StateKeys.ANALYSIS_LIST not in st.session_state: diff --git a/alphastats/plots/VolcanoPlot.py b/alphastats/plots/VolcanoPlot.py index 7c7ac787..14b15ad5 100644 --- a/alphastats/plots/VolcanoPlot.py +++ b/alphastats/plots/VolcanoPlot.py @@ -235,7 +235,10 @@ def _annotate_result_df(self): add color labels for up and down regulates """ self.res = self.res[(self.res["log2fc"] < 20) & (self.res["log2fc"] > -20)] - self.res["-log10(p-value)"] = -np.log10(self.res[self.pvalue_column]) + # TODO: this is a 
bit hacky, but is necessary due to the masked p-values after automatic filtering. Look for a better solution where the p-values are calculated + self.res["-log10(p-value)"] = [ + -np.log10(el) for el in self.res[self.pvalue_column] + ] self.alpha = -np.log10(self.alpha) # add color variable to plot diff --git a/alphastats/statistics/DifferentialExpressionAnalysis.py b/alphastats/statistics/DifferentialExpressionAnalysis.py index 6464b5ef..aff27088 100644 --- a/alphastats/statistics/DifferentialExpressionAnalysis.py +++ b/alphastats/statistics/DifferentialExpressionAnalysis.py @@ -144,6 +144,17 @@ def _welch_ttest(self) -> pd.DataFrame: return df def _generic_ttest(self, test_fun: Callable) -> pd.DataFrame: + """ + Perform a t-test between two groups, assuming log-normally distributed data. + + If the data was not already log transformed during preprocessing, it will be log2 transformed here. > Log2-transformed data will be used for the t-test + + Parameters: + test_fun (Callable): A function that performs a t-test, e.g. scipy.stats.ttest_ind or scipy.stats.ttest_rel + + Returns: + pd.DataFrame: DataFrame with index_column, p-value and log2 fold change. 
+ """ group1_samples = self.metadata[self.metadata[self.column] == self.group1][ Cols.SAMPLE ].tolist() @@ -153,10 +164,16 @@ def _generic_ttest(self, test_fun: Callable) -> pd.DataFrame: # calculate fold change (if its is not logarithimic normalized) mat_transpose = self.mat.transpose() + if not self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]: + mat_transpose = mat_transpose.transform(lambda x: np.log2(x)) + mat_transpose = mat_transpose.replace([np.inf, -np.inf], np.nan) + + # TODO: return not only the p-value, but also the t-statistic p_values = mat_transpose.apply( lambda row: test_fun( row[group1_samples].values.flatten(), row[group2_samples].values.flatten(), + nan_policy="omit", )[1], axis=1, ) @@ -170,9 +187,7 @@ def _generic_ttest(self, test_fun: Callable) -> pd.DataFrame: mat_transpose=mat_transpose, group1_samples=group1_samples, group2_samples=group2_samples, - is_log2_transformed=self.preprocessing_info[ - PreprocessingStateKeys.LOG2_TRANSFORMED - ], + is_log2_transformed=True, ) return df diff --git a/testfiles/synthetic/preprocessing_pentests.csv b/testfiles/synthetic/preprocessing_pentests.csv new file mode 100644 index 00000000..311c0c00 --- /dev/null +++ b/testfiles/synthetic/preprocessing_pentests.csv @@ -0,0 +1,21 @@ +Protein IDs,Gene names,Intensity S1,Intensity S2,Intensity S3,Intensity S4 +P1,G1,39616,108153,132790,91671 +P2,G2,,,,181742 +P3,G3,141445,0,0,0 +P4,G4,24254,25612,149501,0 +P5,G5,127808,41498,,102773 +P6,G6,,,, +P7,G7,0,0,0,0 +P8,G8,75078,24887,104354,31023 +P9,G9,42729,188031,109334,51308 +P10,G10,167145,89193,23843,20795 +P11;P11-2,G11,194209,121039,166156,124618 +P12;P21;P22,G12;G21;G22,54389,159559,57823,42809 +P13;P14-2,G13;G14,147080,178678,109757,54960 +P14;P15,G14;G15,160613,115275,102138,43485 +P15,,159979,108805,115492,69940 +P16,G16,127875,199145,106602,27661 +P17,G16,42913,141174,118502,120732 +P18,G18,122464,135518,43450,54453 +P19,G19,79095,61045,179863,133889 +P20,G20,114108,87382,89980,165134 diff 
--git a/testfiles/synthetic/preprocessing_pentests_metadata.csv b/testfiles/synthetic/preprocessing_pentests_metadata.csv new file mode 100644 index 00000000..ba2be273 --- /dev/null +++ b/testfiles/synthetic/preprocessing_pentests_metadata.csv @@ -0,0 +1,5 @@ +sample,groups +S1,1 +S2,1 +S3,2 +S4,2 diff --git a/tests/gui/test_04_preprocessing.py b/tests/gui/test_04_preprocessing.py index b2e93905..05bd4578 100644 --- a/tests/gui/test_04_preprocessing.py +++ b/tests/gui/test_04_preprocessing.py @@ -24,7 +24,7 @@ def test_page_04_loads_with_input(): at.run() assert not at.exception - assert at.columns[0].selectbox.len == 6 + assert at.columns[0].selectbox.len == 3 assert at.button.len == 2 diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py index 3cd65ac5..70f2696a 100644 --- a/tests/test_DataSet.py +++ b/tests/test_DataSet.py @@ -385,14 +385,14 @@ class TestMaxQuantDataSet(BaseTestDataSet.BaseTest): def setUp(self): self.loader = MaxQuantLoader(file="testfiles/maxquant/proteinGroups.txt") self.metadata_path = "testfiles/maxquant/metadata.xlsx" - self.obj = DataSet( + self.obj: DataSet = DataSet( loader=self.loader, metadata_path_or_df=self.metadata_path, sample_column="sample", ) # expected dimensions of matrix - self.matrix_dim = (312, 2596) - self.matrix_dim_filtered = (312, 2397) + self.matrix_dim = (312, 2611) + self.matrix_dim_filtered = (312, 2409) self.comparison_column = "disease" def test_load_evidence_wrong_sample_names(self): @@ -413,7 +413,9 @@ def test_plot_pca_group(self): self.assertEqual(len(pca_plot.to_plotly_json().get("data")), 5) def test_data_completeness(self): - self.obj.preprocess(log2_transform=False, data_completeness=0.7) + self.obj.preprocess( + log2_transform=False, replace_zeroes=True, data_completeness=0.7 + ) self.assertEqual(self.obj.mat.shape[1], 159) def test_plot_pca_circles(self): @@ -486,14 +488,15 @@ def test_plot_volcano_compare_preprocessing_modes_randomforest(self): self.assertEqual(len(result_list), 3) def 
test_preprocess_subset(self): - self.obj.preprocess(subset=True, log2_transform=False) - self.assertEqual(self.obj.mat.shape, (48, 1364)) + self.obj.preprocess(subset=True) + self.assertEqual(self.obj.mat.shape[0], 48) @patch("alphastats.DataSet.DataSet.tukey_test") def test_anova_without_tukey(self, mock): + # TODO: Check why 4 extra rows are generated here. This is not due to changes made to 0 and nan filtering. anova_results = self.obj.anova(column="disease", protein_ids="all", tukey=False) self.assertEqual(anova_results["ANOVA_pvalue"][1], 0.4469688936240973) - self.assertEqual(anova_results.shape, (2600, 2)) + self.assertEqual(anova_results.shape, (self.matrix_dim[1] + 4, 2)) # check if tukey isnt called mock.assert_not_called() @@ -518,7 +521,7 @@ def test_plot_intensity_subgroup_gracefully_handle_one_group(self): def test_anova_with_tukey(self): # with first 100 protein ids - self.obj.preprocess(imputation="mean") + self.obj.preprocess(data_completeness=0.05, imputation="mean") id_list = self.obj.mat.columns.tolist()[0:100] results = self.obj.anova(column="disease", protein_ids=id_list, tukey=True) self.assertEqual(results.shape, (100, 10)) @@ -555,7 +558,7 @@ def test_plot_volcano_with_labels(self): draw_line=False, ) n_labels = len(plot.to_plotly_json().get("layout").get("annotations")) - self.assertTrue(n_labels > 20) + self.assertTrue(n_labels > 5) def test_plot_volcano_wald(self): """ @@ -624,7 +627,7 @@ def test_plot_volcano_with_labels_proteins(self): labels=True, ) n_labels = len(plot.to_plotly_json().get("layout").get("annotations")) - self.assertEqual(n_labels, 20) + self.assertEqual(n_labels, 9) def test_plot_volcano_with_labels_proteins_welch_ttest(self): # remove gene names @@ -748,10 +751,13 @@ def test_plot_samplehistograms(self): self.assertEqual(312, len(fig["data"])) def test_batch_correction(self): - self.obj.preprocess(subset=True, imputation="knn", normalization="linear") + self.obj.preprocess( + subset=True, replace_zeroes=True, 
data_completeness=0.1, imputation="knn" + ) self.obj.batch_correction(batch="batch_artifical_added") + # TODO: check if batch correction worked, but not by np.isclose, as this will change whenever something else about preprocessing is changed first_value = self.obj.mat.values[0, 0] - self.assertTrue(np.isclose(2.624937690577153e-08, first_value)) + self.assertTrue(np.isclose(150490495.32554176, first_value)) # TODO this opens a plot in a browser window @skip # TODO multicova_analysis is unused @@ -915,8 +921,8 @@ def setUp(self): sample_column="analytical_sample external_id", ) # expected dimensions of matrix - self.matrix_dim = (20, 6) - self.matrix_dim_filtered = (20, 6) + self.matrix_dim = (20, 10) + self.matrix_dim_filtered = (20, 10) self.comparison_column = "grouping1" @@ -986,8 +992,8 @@ def setUp(self): self.loader = copy.deepcopy(self.cls_loader) self.metadata_path = copy.deepcopy(self.cls_metadata_path) self.obj = copy.deepcopy(self.cls_obj) - self.matrix_dim = (8, 5) - self.matrix_dim_filtered = (8, 5) + self.matrix_dim = (8, 10) + self.matrix_dim_filtered = (8, 10) self.comparison_column = "grouping1" @classmethod @@ -996,5 +1002,63 @@ def tearDownClass(cls): shutil.rmtree("testfiles/fragpipe/__MACOSX") +class TestSyntheticDataSet(BaseTestDataSet.BaseTest): + @classmethod + def setUpClass(cls): + cls.cls_loader = GenericLoader( + file="testfiles/synthetic/preprocessing_pentests.csv", + intensity_column="Intensity [sample]", + index_column="Protein IDs", + ) + cls.cls_metadata_path = ( + "testfiles/synthetic/preprocessing_pentests_metadata.csv" + ) + cls.cls_obj = DataSet( + loader=cls.cls_loader, + metadata_path_or_df=cls.cls_metadata_path, + sample_column="sample", + ) + + def setUp(self): + self.loader = copy.deepcopy(self.cls_loader) + self.metadata_path = copy.deepcopy(self.cls_metadata_path) + self.obj = copy.deepcopy(self.cls_obj) + self.matrix_dim = (4, 20) + self.matrix_dim_filtered = (4, 20) + self.comparison_column = "groups" + + def 
test_preprocess_do_nothing(self): + """No preprocessing""" + self.obj.preprocess() + self.assertEqual(self.obj.mat.shape, self.matrix_dim) + self.assertEqual(np.isnan(self.obj.mat.values.flatten()).sum(), 8) + + def test_preprocess_drop_unmeasured_features(self): + """Remove one completely empty row""" + self.obj.preprocess(drop_unmeasured_features=True) + self.assertEqual(self.obj.mat.shape[1], 19) + self.assertEqual( + self.obj.preprocessing_info[ + PreprocessingStateKeys.DROP_UNMEASURED_FEATURES + ], + 1, + ) + + def test_preprocess_replace_zero(self): + """Replace zeros with NaNs, remove two rows, leave 8 nans""" + self.obj.preprocess(replace_zeroes=True, drop_unmeasured_features=True) + self.assertEqual(self.obj.mat.shape[1], 18) + self.assertEqual(np.isnan(self.obj.mat.values.flatten()).sum(), 8) + self.assertEqual( + self.obj.preprocessing_info[ + PreprocessingStateKeys.DROP_UNMEASURED_FEATURES + ], + 2, + ) + self.assertEqual( + self.obj.preprocessing_info[PreprocessingStateKeys.REPLACE_ZEROES], True + ) + + if __name__ == "__main__": unittest.main()