From ccd6a18c4c5422ef1e6635244248d21e2c066ec3 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 17 Sep 2024 12:49:56 +0200 Subject: [PATCH 1/4] introduce PreprocessingKeys --- alphastats/DataSet.py | 26 +++++----- alphastats/DataSet_Preprocess.py | 49 ++++++++++++++----- alphastats/DataSet_Statistics.py | 8 ++- alphastats/__init__.py | 3 +- alphastats/gui/pages/02_Import Data.py | 3 +- alphastats/gui/utils/overview_helper.py | 21 ++++++-- alphastats/gui/utils/preprocessing_helper.py | 2 +- alphastats/plots/VolcanoPlot.py | 8 ++- .../DifferentialExpressionAnalysis.py | 9 +++- tests/gui/test_04_preprocessing.py | 5 +- tests/test_DataSet.py | 9 +++- 11 files changed, 96 insertions(+), 47 deletions(-) diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index afeaf3f1..cae9ec15 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -10,7 +10,7 @@ from alphastats.DataSet_Plot import Plot -from alphastats.DataSet_Preprocess import Preprocess +from alphastats.DataSet_Preprocess import Preprocess, PreprocessingStateKeys from alphastats.DataSet_Pathway import Enrichment from alphastats.DataSet_Statistics import Statistics from alphastats.utils import LoaderError @@ -188,7 +188,7 @@ def _remove_misc_samples_in_metadata(self): def _subset(self): # filter matrix so only samples that are described in metadata are also found in matrix self.preprocessing_info.update( - {"Matrix: Number of samples": self.metadata.shape[0]} + {PreprocessingStateKeys.NUM_SAMPLES: self.metadata.shape[0]} ) return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())] @@ -265,17 +265,17 @@ def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame: def _save_dataset_info(self): n_proteingroups = self.mat.shape[1] preprocessing_dict = { - "Raw data number of Protein Groups": n_proteingroups, - "Matrix: Number of ProteinIDs/ProteinGroups": self.mat.shape[1], - "Matrix: Number of samples": self.mat.shape[0], - 
"Intensity used for analysis": self.intensity_column, - "Log2-transformed": False, - "Normalization": None, - "Imputation": None, - "Contaminations have been removed": False, - "Contamination columns": self.filter_columns, - "Number of removed ProteinGroups due to contaminaton": 0, - "Data completeness cut-off": 0, + PreprocessingStateKeys.RAW_DATA_NUM_PG: n_proteingroups, + PreprocessingStateKeys.NUM_PG: self.mat.shape[1], + PreprocessingStateKeys.NUM_SAMPLES: self.mat.shape[0], + PreprocessingStateKeys.INTENSITY_COLUMN: self.intensity_column, + PreprocessingStateKeys.LOG2_TRANSFORMED: False, + PreprocessingStateKeys.NORMALIZATION: None, + PreprocessingStateKeys.IMPUTATION: None, + PreprocessingStateKeys.CONTAMINATIONS_REMOVED: False, + PreprocessingStateKeys.CONTAMINATION_COLUMNS: self.filter_columns, + PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: 0, + PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0, } return preprocessing_dict diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/DataSet_Preprocess.py index 074d252d..04f74082 100644 --- a/alphastats/DataSet_Preprocess.py +++ b/alphastats/DataSet_Preprocess.py @@ -13,6 +13,24 @@ from alphastats.utils import ignore_warning +class PreprocessingStateKeys: + """Keys for accessing the dictionary holding the information about preprocessing.""" + + RAW_DATA_NUM_PG = "Raw data number of Protein Groups" + NUM_PG = "Matrix: Number of ProteinIDs/ProteinGroups" + NUM_SAMPLES = "Matrix: Number of samples" + INTENSITY_COLUMN = "Intensity used for analysis" + LOG2_TRANSFORMED = "Log2-transformed" + NORMALIZATION = "Normalization" + IMPUTATION = "Imputation" + CONTAMINATIONS_REMOVED = "Contaminations have been removed" + CONTAMINATION_COLUMNS = "Contamination columns" + NUM_REMOVED_PG_DUE_TO_CONTAMINATION = ( + "Number of removed ProteinGroups due to contaminaton" + ) + DATA_COMPLETENESS_CUTOFF = "Data completeness cut-off" + + class Preprocess: imputation_methods = ["mean", "median", "knn", 
"randomforest"] normalization_methods = ["vst", "zscore", "quantile"] @@ -45,14 +63,17 @@ def _remove_samples(self, sample_list: list): def _subset(self): # filter matrix so only samples that are described in metadata are also found in matrix self.preprocessing_info.update( - {"Matrix: Number of samples": self.metadata.shape[0]} + {PreprocessingStateKeys.NUM_SAMPLES: self.metadata.shape[0]} ) return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())] def _remove_na_values(self, cut_off): if ( self.preprocessing_info.get("Missing values were removed") - and self.preprocessing_info.get("Data completeness cut-off") == cut_off + and self.preprocessing_info.get( + PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF + ) + == cut_off ): logging.info("Missing values have already been filtered.") st.warning( @@ -86,7 +107,7 @@ def _remove_na_values(self, cut_off): "Number of removed ProteinGroups due to data completeness cutoff": num_proteins - self.mat.shape[1], "Missing values were removed": True, - "Data completeness cut-off": cut_off, + PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: cut_off, } ) @@ -95,7 +116,7 @@ def _filter(self): logging.info("No columns to filter.") return - if self.preprocessing_info.get("Contaminations have been removed"): + if self.preprocessing_info.get(PreprocessingStateKeys.CONTAMINATIONS_REMOVED): logging.info("Contaminatons have already been filtered.") return @@ -113,11 +134,11 @@ def _filter(self): self.preprocessing_info.update( { - "Number of removed ProteinGroups due to contaminaton": len( + PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: len( protein_groups_to_remove ), - "Contaminations have been removed": True, - "Matrix: Number of ProteinIDs/ProteinGroups": self.mat.shape[1], + PreprocessingStateKeys.CONTAMINATIONS_REMOVED: True, + PreprocessingStateKeys.NUM_PG: self.mat.shape[1], } ) @@ -177,7 +198,7 @@ def _imputation(self, method: str): self.mat = pd.DataFrame( imputation_array, index=self.mat.index, 
columns=self.mat.columns ) - self.preprocessing_info.update({"Imputation": method}) + self.preprocessing_info.update({PreprocessingStateKeys.IMPUTATION: method}) def _linear_normalization(self, dataframe: pd.DataFrame): """Normalize data using l2 norm without breaking when encoutering nones @@ -227,7 +248,7 @@ def _normalization(self, method: str): normalized_array, index=self.mat.index, columns=self.mat.columns ) - self.preprocessing_info.update({"Normalization": method}) + self.preprocessing_info.update({PreprocessingStateKeys.NORMALIZATION: method}) # TODO this needs to be reimplemented # @ignore_warning(RuntimeWarning) @@ -267,7 +288,7 @@ def _normalization(self, method: str): def _log2_transform(self): self.mat = np.log2(self.mat + 0.1) - self.preprocessing_info.update({"Log2-transformed": True}) + self.preprocessing_info.update({PreprocessingStateKeys.LOG2_TRANSFORMED: True}) print("Data has been log2-transformed.") def batch_correction(self, batch: str) -> pd.DataFrame: @@ -364,7 +385,11 @@ def preprocess( if data_completeness > 0: self._remove_na_values(cut_off=data_completeness) - if log2_transform and self.preprocessing_info.get("Log2-transformed") is False: + if ( + log2_transform + and self.preprocessing_info.get(PreprocessingStateKeys.LOG2_TRANSFORMED) + is False + ): self._log2_transform() if normalization is not None: @@ -380,7 +405,7 @@ def preprocess( self.preprocessing_info.update( { - "Matrix: Number of ProteinIDs/ProteinGroups": self.mat.shape[1], + PreprocessingStateKeys.NUM_PG: self.mat.shape[1], } ) diff --git a/alphastats/DataSet_Statistics.py b/alphastats/DataSet_Statistics.py index 907cd531..ed4a47fc 100644 --- a/alphastats/DataSet_Statistics.py +++ b/alphastats/DataSet_Statistics.py @@ -1,11 +1,9 @@ -from codecs import ignore_errors -from itertools import permutations import pandas as pd -import scipy.stats import numpy as np import pingouin + +from alphastats.DataSet_Preprocess import PreprocessingStateKeys from alphastats.utils import 
ignore_warning -from tqdm import tqdm from functools import lru_cache from typing import Union @@ -22,7 +20,7 @@ def _calculate_foldchange( ) -> pd.DataFrame: mat_transpose += 0.00001 - if self.preprocessing_info["Log2-transformed"]: + if self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]: fc = ( mat_transpose[group1_samples].T.mean().values - mat_transpose[group2_samples].T.mean().values diff --git a/alphastats/__init__.py b/alphastats/__init__.py index a7902acd..633cf4af 100644 --- a/alphastats/__init__.py +++ b/alphastats/__init__.py @@ -39,10 +39,11 @@ "development": "requirements_development.txt", } +# TODO get rid of these imports from .loader.AlphaPeptLoader import * from .loader.DIANNLoader import * from .loader.FragPipeLoader import * from .loader.MaxQuantLoader import * -from .DataSet import * +from .loader.SpectronautLoader import * from .cli import * import alphastats.gui diff --git a/alphastats/gui/pages/02_Import Data.py b/alphastats/gui/pages/02_Import Data.py index bfa32263..cfa75fa4 100644 --- a/alphastats/gui/pages/02_Import Data.py +++ b/alphastats/gui/pages/02_Import Data.py @@ -2,7 +2,8 @@ import streamlit as st -from alphastats import DataSet, BaseLoader +from alphastats.DataSet import DataSet +from alphastats import BaseLoader from alphastats.gui.utils.options import SOFTWARE_OPTIONS from alphastats.gui.utils.import_helper import ( diff --git a/alphastats/gui/utils/overview_helper.py b/alphastats/gui/utils/overview_helper.py index c5678cb9..f19631ec 100644 --- a/alphastats/gui/utils/overview_helper.py +++ b/alphastats/gui/utils/overview_helper.py @@ -1,7 +1,8 @@ import streamlit as st import pandas as pd -from alphastats import DataSet +from alphastats.DataSet_Preprocess import PreprocessingStateKeys +from alphastats.DataSet import DataSet from alphastats.gui.utils.ui_helper import convert_df @@ -33,11 +34,23 @@ def get_display_matrix(): def display_matrix(): text = ( "Normalization: " - + 
str(st.session_state.dataset.preprocessing_info["Normalization"]) + + str( + st.session_state.dataset.preprocessing_info[ + PreprocessingStateKeys.NORMALIZATION + ] + ) + ", Imputation: " - + str(st.session_state.dataset.preprocessing_info["Imputation"]) + + str( + st.session_state.dataset.preprocessing_info[ + PreprocessingStateKeys.IMPUTATION + ] + ) + ", Log2-transformed: " - + str(st.session_state.dataset.preprocessing_info["Log2-transformed"]) + + str( + st.session_state.dataset.preprocessing_info[ + PreprocessingStateKeys.LOG2_TRANSFORMED + ] + ) ) st.markdown("**DataFrame used for analysis** *preview*") diff --git a/alphastats/gui/utils/preprocessing_helper.py b/alphastats/gui/utils/preprocessing_helper.py index 2e4f7d23..da9e53d7 100644 --- a/alphastats/gui/utils/preprocessing_helper.py +++ b/alphastats/gui/utils/preprocessing_helper.py @@ -4,7 +4,7 @@ import pandas as pd from st_cytoscape import cytoscape -from alphastats import DataSet +from alphastats.DataSet import DataSet CYTOSCAPE_STYLESHEET = [ { diff --git a/alphastats/plots/VolcanoPlot.py b/alphastats/plots/VolcanoPlot.py index e3e3f1a5..7ba930c3 100644 --- a/alphastats/plots/VolcanoPlot.py +++ b/alphastats/plots/VolcanoPlot.py @@ -1,5 +1,6 @@ +from alphastats.DataSet_Preprocess import PreprocessingStateKeys from alphastats.plots.PlotUtils import PlotUtils, plotly_object -from alphastats.utils import ignore_warning, check_for_missing_values +from alphastats.utils import ignore_warning import numpy as np import pandas as pd @@ -158,7 +159,10 @@ def _sam(self): transposed = self.dataset.mat.transpose() - if self.dataset.preprocessing_info["Normalization"] is None: + if ( + self.dataset.preprocessing_info[PreprocessingStateKeys.NORMALIZATION] + is None + ): # needs to be lpog2 transformed for fold change calculations transposed = transposed.transform(lambda x: np.log2(x)) diff --git a/alphastats/statistics/DifferentialExpressionAnalysis.py b/alphastats/statistics/DifferentialExpressionAnalysis.py 
index 0a6735c9..698ab032 100644 --- a/alphastats/statistics/DifferentialExpressionAnalysis.py +++ b/alphastats/statistics/DifferentialExpressionAnalysis.py @@ -4,6 +4,8 @@ import scipy from typing import Union +from alphastats.DataSet_Preprocess import PreprocessingStateKeys + class DifferentialExpressionAnalysis: def __init__( @@ -99,7 +101,10 @@ def sam(self) -> pd.DataFrame: transposed = self.dataset.mat.transpose() - if self.dataset.preprocessing_info["Normalization"] is None: + if ( + self.dataset.preprocessing_info[PreprocessingStateKeys.NORMALIZATION] + is None + ): # needs to be lpog2 transformed for fold change calculations transposed = transposed.transform(lambda x: np.log2(x)) @@ -230,7 +235,7 @@ def _calculate_foldchange( ): mat_transpose += 0.00001 - if self.dataset.preprocessing_info["Log2-transformed"]: + if self.dataset.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]: fc = ( mat_transpose[group1_samples].T.mean().values - mat_transpose[group2_samples].T.mean().values diff --git a/tests/gui/test_04_preprocessing.py b/tests/gui/test_04_preprocessing.py index 7c0e6f1d..7300804b 100644 --- a/tests/gui/test_04_preprocessing.py +++ b/tests/gui/test_04_preprocessing.py @@ -1,10 +1,7 @@ -from alphastats import DataSet +from alphastats.DataSet import DataSet from alphastats.load_data import load_data from streamlit.testing.v1 import AppTest from pathlib import Path -from unittest.mock import MagicMock, patch -import pandas as pd -from io import BytesIO APP_FOLDER = Path(__file__).parent / Path("../../alphastats/gui/") diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py index 3ff258d2..127ae94d 100644 --- a/tests/test_DataSet.py +++ b/tests/test_DataSet.py @@ -10,6 +10,7 @@ import os import copy +from alphastats.DataSet_Preprocess import PreprocessingStateKeys from alphastats.loader.DIANNLoader import DIANNLoader from alphastats.loader.MaxQuantLoader import MaxQuantLoader from alphastats.loader.AlphaPeptLoader import AlphaPeptLoader @@ 
-125,12 +126,16 @@ def test_preprocess_filter_already_filter(self, mock): # is info printed if contamination columns get removed # is the new matrix smaller than the older matrix self.assertFalse( - self.obj.preprocessing_info.get("Contaminations have been removed") + self.obj.preprocessing_info.get( + PreprocessingStateKeys.CONTAMINATIONS_REMOVED + ) ) self.obj.preprocess(remove_contaminations=True) self.assertEqual(self.obj.mat.shape, self.matrix_dim_filtered) self.assertTrue( - self.obj.preprocessing_info.get("Contaminations have been removed") + self.obj.preprocessing_info.get( + PreprocessingStateKeys.CONTAMINATIONS_REMOVED + ) ) self.obj.preprocess(remove_contaminations=True) self.assertEqual(self.obj.mat.shape, self.matrix_dim_filtered) From ebf15418e51ac8a9a53c42ee25f40b5fc5510b44 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 17 Sep 2024 12:54:34 +0200 Subject: [PATCH 2/4] introduce previously missed PreprocessingKeys --- alphastats/DataSet.py | 2 ++ alphastats/DataSet_Preprocess.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index cae9ec15..5a77dfa8 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -276,6 +276,8 @@ def _save_dataset_info(self): PreprocessingStateKeys.CONTAMINATION_COLUMNS: self.filter_columns, PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: 0, PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0, + PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0, + PreprocessingStateKeys.MISSING_VALUES_REMOVED: False, } return preprocessing_dict diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/DataSet_Preprocess.py index 04f74082..107b1745 100644 --- a/alphastats/DataSet_Preprocess.py +++ b/alphastats/DataSet_Preprocess.py @@ -29,6 +29,10 @@ class PreprocessingStateKeys: "Number of removed ProteinGroups due to contaminaton" ) DATA_COMPLETENESS_CUTOFF = "Data 
completeness cut-off" + NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF = ( + "Number of removed ProteinGroups due to data completeness cutoff" + ) + MISSING_VALUES_REMOVED = "Missing values were removed" class Preprocess: @@ -69,7 +73,7 @@ def _subset(self): def _remove_na_values(self, cut_off): if ( - self.preprocessing_info.get("Missing values were removed") + self.preprocessing_info.get(PreprocessingStateKeys.MISSING_VALUES_REMOVED) and self.preprocessing_info.get( PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF ) @@ -104,9 +108,9 @@ def _remove_na_values(self, cut_off): self.preprocessing_info.update( { - "Number of removed ProteinGroups due to data completeness cutoff": num_proteins + PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: num_proteins - self.mat.shape[1], - "Missing values were removed": True, + PreprocessingStateKeys.MISSING_VALUES_REMOVED: True, PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: cut_off, } ) From d0e5e23a9b8af4c0b23f6eba1f6c87ade3f609c0 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 17 Sep 2024 13:21:23 +0200 Subject: [PATCH 3/4] fix tests --- nbs/getting_started.ipynb | 2 +- nbs/liu_2019.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nbs/getting_started.ipynb b/nbs/getting_started.ipynb index de914369..5d265f86 100644 --- a/nbs/getting_started.ipynb +++ b/nbs/getting_started.ipynb @@ -713,7 +713,7 @@ } ], "source": [ - "ds = alphastats.DataSet(\n", + "ds = alphastats.DataSet.DataSet(\n", " loader = maxquant_data, \n", " metadata_path = \"../testfiles/maxquant/metadata.xlsx\",\n", " sample_column = \"sample\" # specify the column that corresponds to the sample names in proteinGroups\n", diff --git a/nbs/liu_2019.ipynb b/nbs/liu_2019.ipynb index 97296879..0595029f 100644 --- a/nbs/liu_2019.ipynb +++ b/nbs/liu_2019.ipynb @@ -87,7 +87,7 @@ " index_column=\"Gene names\",\n", " gene_names_column=None\n", ")\n", - "dataset = 
alphastats.DataSet(\n", + "dataset = alphastats.DataSet.DataSet(\n", " loader = loader, \n", " metadata_path=\"../testfiles/maxquant/metadata.xlsx\", \n", " sample_column=\"sample\"\n", From 8e572e3334a01dae45fd431f897a9f84b0839a8a Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 17 Sep 2024 15:01:59 +0200 Subject: [PATCH 4/4] fix notebooks --- docs/functions.rst | 10 ---------- docs/import_data.md | 2 +- nbs/getting_started.ipynb | 5 ++++- nbs/liu_2019.ipynb | 5 ++++- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/docs/functions.rst b/docs/functions.rst index f8679473..fe7feb61 100644 --- a/docs/functions.rst +++ b/docs/functions.rst @@ -87,13 +87,3 @@ All GO-analysis functions will return a DataFrame with the results. * Plot Scatterplot with -log10(p-value) on x-axis and effect size on y-axis. `df.plot_scatter()` * Plot p-values as Barplot `df.plot_bar` - - -Misc ------- - -Get an overview over your dataset - -* :py:meth:`~alphastats.DataSet.overview` - -* :py:meth:`~alphastats.DataSet_Preprocess.Preprocess.preprocess_print_info` diff --git a/docs/import_data.md b/docs/import_data.md index 6b498175..729129bb 100644 --- a/docs/import_data.md +++ b/docs/import_data.md @@ -115,7 +115,7 @@ To compare samples across various conditions in the downstream analysis, a metad ## Creating a DataSet -The whole downstream analysis can be perforemd on the alphastats.DataSet. To create the DataSet you need to provide the loader object as well as the metadata. +The whole downstream analysis can be performed on the alphastats.DataSet. To create the DataSet you need to provide the loader object as well as the metadata. 
```python import alphastats diff --git a/nbs/getting_started.ipynb b/nbs/getting_started.ipynb index 5d265f86..857e7707 100644 --- a/nbs/getting_started.ipynb +++ b/nbs/getting_started.ipynb @@ -9,6 +9,9 @@ "source": [ "import pandas as pd\n", "import warnings\n", + "\n", + "from alphastats.DataSet import DataSet\n", + "\n", "warnings.filterwarnings(\"ignore\", category=UserWarning, module=\"openpyxl\") # remove warning from mac" ] }, @@ -713,7 +716,7 @@ } ], "source": [ - "ds = alphastats.DataSet.DataSet(\n", + "ds = DataSet(\n", " loader = maxquant_data, \n", " metadata_path = \"../testfiles/maxquant/metadata.xlsx\",\n", " sample_column = \"sample\" # specify the column that corresponds to the sample names in proteinGroups\n", diff --git a/nbs/liu_2019.ipynb b/nbs/liu_2019.ipynb index 0595029f..8a8abff9 100644 --- a/nbs/liu_2019.ipynb +++ b/nbs/liu_2019.ipynb @@ -20,6 +20,9 @@ "source": [ "import alphastats\n", "import plotly.io as pio\n", + "\n", + "from alphastats.DataSet import DataSet\n", + "\n", "pio.renderers.default = \"plotly_mimetype+notebook\" " ] }, @@ -87,7 +90,7 @@ " index_column=\"Gene names\",\n", " gene_names_column=None\n", ")\n", - "dataset = alphastats.DataSet.DataSet(\n", + "dataset = DataSet(\n", " loader = loader, \n", " metadata_path=\"../testfiles/maxquant/metadata.xlsx\", \n", " sample_column=\"sample\"\n",