diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index afeaf3f1..5a77dfa8 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -10,7 +10,7 @@ from alphastats.DataSet_Plot import Plot -from alphastats.DataSet_Preprocess import Preprocess +from alphastats.DataSet_Preprocess import Preprocess, PreprocessingStateKeys from alphastats.DataSet_Pathway import Enrichment from alphastats.DataSet_Statistics import Statistics from alphastats.utils import LoaderError @@ -188,7 +188,7 @@ def _remove_misc_samples_in_metadata(self): def _subset(self): # filter matrix so only samples that are described in metadata are also found in matrix self.preprocessing_info.update( - {"Matrix: Number of samples": self.metadata.shape[0]} + {PreprocessingStateKeys.NUM_SAMPLES: self.metadata.shape[0]} ) return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())] @@ -265,17 +265,19 @@ def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame: def _save_dataset_info(self): n_proteingroups = self.mat.shape[1] preprocessing_dict = { - "Raw data number of Protein Groups": n_proteingroups, - "Matrix: Number of ProteinIDs/ProteinGroups": self.mat.shape[1], - "Matrix: Number of samples": self.mat.shape[0], - "Intensity used for analysis": self.intensity_column, - "Log2-transformed": False, - "Normalization": None, - "Imputation": None, - "Contaminations have been removed": False, - "Contamination columns": self.filter_columns, - "Number of removed ProteinGroups due to contaminaton": 0, - "Data completeness cut-off": 0, + PreprocessingStateKeys.RAW_DATA_NUM_PG: n_proteingroups, + PreprocessingStateKeys.NUM_PG: self.mat.shape[1], + PreprocessingStateKeys.NUM_SAMPLES: self.mat.shape[0], + PreprocessingStateKeys.INTENSITY_COLUMN: self.intensity_column, + PreprocessingStateKeys.LOG2_TRANSFORMED: False, + PreprocessingStateKeys.NORMALIZATION: None, + PreprocessingStateKeys.IMPUTATION: None, + PreprocessingStateKeys.CONTAMINATIONS_REMOVED: False, + PreprocessingStateKeys.CONTAMINATION_COLUMNS: self.filter_columns, + PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: 0, + PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0, + PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0, + PreprocessingStateKeys.MISSING_VALUES_REMOVED: False, } return preprocessing_dict diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/DataSet_Preprocess.py index de5fcf5c..029708c4 100644 --- a/alphastats/DataSet_Preprocess.py +++ b/alphastats/DataSet_Preprocess.py @@ -13,6 +13,28 @@ from alphastats.utils import ignore_warning +class PreprocessingStateKeys: + """Keys for accessing the dictionary holding the information about preprocessing.""" + + RAW_DATA_NUM_PG = "Raw data number of Protein Groups" + NUM_PG = "Matrix= Number of ProteinIDs/ProteinGroups" + NUM_SAMPLES = "Matrix= Number of samples" + INTENSITY_COLUMN = "Intensity used for analysis" + LOG2_TRANSFORMED = "Log2-transformed" + NORMALIZATION = "Normalization" + IMPUTATION = "Imputation" + CONTAMINATIONS_REMOVED = "Contaminations have been removed" + CONTAMINATION_COLUMNS = "Contamination columns" + NUM_REMOVED_PG_DUE_TO_CONTAMINATION = ( + "Number of removed ProteinGroups due to contaminaton" + ) + DATA_COMPLETENESS_CUTOFF = "Data completeness cut-off" + NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF = ( + "Number of removed ProteinGroups due to data completeness cutoff" + ) + MISSING_VALUES_REMOVED = "Missing values were removed" + + class Preprocess: imputation_methods = ["mean", "median", "knn", "randomforest"] normalization_methods = ["vst", "zscore", "quantile"] @@ -45,14 +67,17 @@ def _remove_samples(self, sample_list: list): def _subset(self): # filter matrix so only samples that are described in metadata are also found in matrix self.preprocessing_info.update( - {"Matrix: Number of samples": self.metadata.shape[0]} + {PreprocessingStateKeys.NUM_SAMPLES: self.metadata.shape[0]} ) return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())] def _remove_na_values(self, cut_off): if ( - self.preprocessing_info.get("Missing values were removed") - and self.preprocessing_info.get("Data completeness cut-off") == cut_off + self.preprocessing_info.get(PreprocessingStateKeys.MISSING_VALUES_REMOVED) + and self.preprocessing_info.get( + PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF + ) + == cut_off ): logging.info("Missing values have already been filtered.") st.warning( @@ -83,10 +108,10 @@ def _remove_na_values(self, cut_off): self.preprocessing_info.update( { - "Number of removed ProteinGroups due to data completeness cutoff": num_proteins + PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: num_proteins - self.mat.shape[1], - "Missing values were removed": True, - "Data completeness cut-off": cut_off, + PreprocessingStateKeys.MISSING_VALUES_REMOVED: True, + PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: cut_off, } ) @@ -95,7 +120,7 @@ def _filter(self): logging.info("No columns to filter.") return - if self.preprocessing_info.get("Contaminations have been removed"): + if self.preprocessing_info.get(PreprocessingStateKeys.CONTAMINATIONS_REMOVED): logging.info("Contaminatons have already been filtered.") return @@ -113,11 +138,11 @@ def _filter(self): self.preprocessing_info.update( { - "Number of removed ProteinGroups due to contaminaton": len( + PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: len( protein_groups_to_remove ), - "Contaminations have been removed": True, - "Matrix: Number of ProteinIDs/ProteinGroups": self.mat.shape[1], + PreprocessingStateKeys.CONTAMINATIONS_REMOVED: True, + PreprocessingStateKeys.NUM_PG: self.mat.shape[1], } ) @@ -177,7 +202,7 @@ def _imputation(self, method: str): self.mat = pd.DataFrame( imputation_array, index=self.mat.index, columns=self.mat.columns ) - self.preprocessing_info.update({"Imputation": method}) + self.preprocessing_info.update({PreprocessingStateKeys.IMPUTATION: method}) def _linear_normalization(self, dataframe: pd.DataFrame): """Normalize data using l2 norm without breaking when encoutering nones @@ -239,7 +264,7 @@ def _normalization(self, method: str) -> None: normalized_array, index=self.mat.index, columns=self.mat.columns ) - self.preprocessing_info.update({"Normalization": method}) + self.preprocessing_info.update({PreprocessingStateKeys.NORMALIZATION: method}) # TODO this needs to be reimplemented # @ignore_warning(RuntimeWarning) @@ -279,7 +304,7 @@ def _normalization(self, method: str) -> None: def _log2_transform(self): self.mat = np.log2(self.mat) - self.preprocessing_info.update({"Log2-transformed": True}) + self.preprocessing_info.update({PreprocessingStateKeys.LOG2_TRANSFORMED: True}) print("Data has been log2-transformed.") def batch_correction(self, batch: str) -> pd.DataFrame: @@ -376,7 +401,11 @@ def preprocess( if data_completeness > 0: self._remove_na_values(cut_off=data_completeness) - if log2_transform and self.preprocessing_info.get("Log2-transformed") is False: + if ( + log2_transform + and self.preprocessing_info.get(PreprocessingStateKeys.LOG2_TRANSFORMED) + is False + ): self._log2_transform() if normalization is not None: @@ -392,7 +421,7 @@ def preprocess( self.preprocessing_info.update( { - "Matrix: Number of ProteinIDs/ProteinGroups": self.mat.shape[1], + PreprocessingStateKeys.NUM_PG: self.mat.shape[1], } ) diff --git a/alphastats/DataSet_Statistics.py b/alphastats/DataSet_Statistics.py index 35526deb..0582b8c6 100644 --- a/alphastats/DataSet_Statistics.py +++ b/alphastats/DataSet_Statistics.py @@ -1,11 +1,9 @@ -from codecs import ignore_errors -from itertools import permutations import pandas as pd -import scipy.stats import numpy as np import pingouin + +from alphastats.DataSet_Preprocess import PreprocessingStateKeys from alphastats.utils import ignore_warning -from tqdm import tqdm from functools import lru_cache from typing import Union @@ -20,7 +18,7 @@ class Statistics: def _calculate_foldchange( self, mat_transpose: pd.DataFrame, group1_samples: list, group2_samples: list ) -> pd.DataFrame: - if self.preprocessing_info["Log2-transformed"]: + if self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]: fc = ( mat_transpose[group1_samples].T.mean().values - mat_transpose[group2_samples].T.mean().values diff --git a/alphastats/__init__.py b/alphastats/__init__.py index a7902acd..633cf4af 100644 --- a/alphastats/__init__.py +++ b/alphastats/__init__.py @@ -39,10 +39,11 @@ "development": "requirements_development.txt", } +# TODO get rid of these imports from .loader.AlphaPeptLoader import * from .loader.DIANNLoader import * from .loader.FragPipeLoader import * from .loader.MaxQuantLoader import * -from .DataSet import * +from .loader.SpectronautLoader import * from .cli import * import alphastats.gui diff --git a/alphastats/gui/pages/02_Import Data.py b/alphastats/gui/pages/02_Import Data.py index 389a4fc6..1fcbdeab 100644 --- a/alphastats/gui/pages/02_Import Data.py +++ b/alphastats/gui/pages/02_Import Data.py @@ -2,7 +2,8 @@ import streamlit as st -from alphastats import DataSet, BaseLoader +from alphastats.DataSet import DataSet +from alphastats import BaseLoader from alphastats.gui.utils.options import SOFTWARE_OPTIONS from alphastats.gui.utils.import_helper import ( diff --git a/alphastats/gui/utils/overview_helper.py b/alphastats/gui/utils/overview_helper.py index acdd025c..6a631462 100644 --- a/alphastats/gui/utils/overview_helper.py +++ b/alphastats/gui/utils/overview_helper.py @@ -1,7 +1,8 @@ import streamlit as st import pandas as pd -from alphastats import DataSet +from alphastats.DataSet_Preprocess import PreprocessingStateKeys +from alphastats.DataSet import DataSet from alphastats.gui.utils.ui_helper import convert_df, StateKeys @@ -33,12 +34,22 @@ def get_display_matrix(): def display_matrix(): text = ( "Normalization: " - + str(st.session_state[StateKeys.DATASET].preprocessing_info["Normalization"]) + + str( + st.session_state[StateKeys.DATASET].preprocessing_info[ + PreprocessingStateKeys.NORMALIZATION + ] + ) + ", Imputation: " - + str(st.session_state[StateKeys.DATASET].preprocessing_info["Imputation"]) + + str( + st.session_state[StateKeys.DATASET].preprocessing_info[ + PreprocessingStateKeys.IMPUTATION + ] + ) + ", Log2-transformed: " + str( - st.session_state[StateKeys.DATASET].preprocessing_info["Log2-transformed"] + st.session_state[StateKeys.DATASET].preprocessing_info[ + PreprocessingStateKeys.LOG2_TRANSFORMED + ] ) ) diff --git a/alphastats/gui/utils/preprocessing_helper.py b/alphastats/gui/utils/preprocessing_helper.py index 2e4f7d23..da9e53d7 100644 --- a/alphastats/gui/utils/preprocessing_helper.py +++ b/alphastats/gui/utils/preprocessing_helper.py @@ -4,7 +4,7 @@ import pandas as pd from st_cytoscape import cytoscape -from alphastats import DataSet +from alphastats.DataSet import DataSet CYTOSCAPE_STYLESHEET = [ { diff --git a/alphastats/plots/VolcanoPlot.py b/alphastats/plots/VolcanoPlot.py index dd5ba480..ec09d591 100644 --- a/alphastats/plots/VolcanoPlot.py +++ b/alphastats/plots/VolcanoPlot.py @@ -1,5 +1,6 @@ +from alphastats.DataSet_Preprocess import PreprocessingStateKeys from alphastats.plots.PlotUtils import PlotUtils, plotly_object -from alphastats.utils import ignore_warning, check_for_missing_values +from alphastats.utils import ignore_warning import numpy as np import pandas as pd @@ -158,7 +159,10 @@ def _sam(self): transposed = self.dataset.mat.transpose() - if self.dataset.preprocessing_info["Log2-transformed"] is None: + if ( + self.dataset.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED] + is None + ): # needs to be lpog2 transformed for fold change calculations transposed = transposed.transform(lambda x: np.log2(x)) diff --git a/alphastats/statistics/DifferentialExpressionAnalysis.py b/alphastats/statistics/DifferentialExpressionAnalysis.py index 2dbc3db3..7db03050 100644 --- a/alphastats/statistics/DifferentialExpressionAnalysis.py +++ b/alphastats/statistics/DifferentialExpressionAnalysis.py @@ -4,6 +4,8 @@ import scipy from typing import Union +from alphastats.DataSet_Preprocess import PreprocessingStateKeys + class DifferentialExpressionAnalysis: def __init__( @@ -99,7 +101,10 @@ def sam(self) -> pd.DataFrame: transposed = self.dataset.mat.transpose() - if self.dataset.preprocessing_info["Log2-transformed"] is None: + if ( + self.dataset.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED] + is None + ): # needs to be lpog2 transformed for fold change calculations transposed = transposed.transform(lambda x: np.log2(x)) @@ -228,7 +233,7 @@ def pairedttest(self) -> pd.DataFrame: def _calculate_foldchange( self, mat_transpose: pd.DataFrame, group1_samples: list, group2_samples: list ): - if self.dataset.preprocessing_info["Log2-transformed"]: + if self.dataset.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]: fc = ( mat_transpose[group1_samples].T.mean().values - mat_transpose[group2_samples].T.mean().values diff --git a/docs/functions.rst b/docs/functions.rst index f8679473..fe7feb61 100644 --- a/docs/functions.rst +++ b/docs/functions.rst @@ -87,13 +87,3 @@ All GO-analysis functions will return a DataFrame with the results. * Plot Scatterplot with -log10(p-value) on x-axis and effect size on y-axis. `df.plot_scatter()` * Plot p-values as Barplot `df.plot_bar` - - -Misc ------- - -Get an overview over your dataset - -* :py:meth:`~alphastats.DataSet.overview` - -* :py:meth:`~alphastats.DataSet_Preprocess.Preprocess.preprocess_print_info` diff --git a/docs/import_data.md b/docs/import_data.md index 6b498175..729129bb 100644 --- a/docs/import_data.md +++ b/docs/import_data.md @@ -115,7 +115,7 @@ To compare samples across various conditions in the downstream analysis, a metad ## Creating a DataSet -The whole downstream analysis can be perforemd on the alphastats.DataSet. To create the DataSet you need to provide the loader object as well as the metadata. +The whole downstream analysis can be performed on the alphastats.DataSet. To create the DataSet you need to provide the loader object as well as the metadata. ```python import alphastats diff --git a/nbs/getting_started.ipynb b/nbs/getting_started.ipynb index de914369..857e7707 100644 --- a/nbs/getting_started.ipynb +++ b/nbs/getting_started.ipynb @@ -9,6 +9,9 @@ "source": [ "import pandas as pd\n", "import warnings\n", + "\n", + "from alphastats.DataSet import DataSet\n", + "\n", "warnings.filterwarnings(\"ignore\", category=UserWarning, module=\"openpyxl\") # remove warning from mac" ] }, @@ -713,7 +716,7 @@ } ], "source": [ - "ds = alphastats.DataSet(\n", + "ds = DataSet(\n", " loader = maxquant_data, \n", " metadata_path = \"../testfiles/maxquant/metadata.xlsx\",\n", " sample_column = \"sample\" # specify the column that corresponds to the sample names in proteinGroups\n", diff --git a/nbs/liu_2019.ipynb b/nbs/liu_2019.ipynb index 97296879..8a8abff9 100644 --- a/nbs/liu_2019.ipynb +++ b/nbs/liu_2019.ipynb @@ -20,6 +20,9 @@ "source": [ "import alphastats\n", "import plotly.io as pio\n", + "\n", + "from alphastats.DataSet import DataSet\n", + "\n", "pio.renderers.default = \"plotly_mimetype+notebook\" " ] }, @@ -87,7 +90,7 @@ " index_column=\"Gene names\",\n", " gene_names_column=None\n", ")\n", - "dataset = alphastats.DataSet(\n", + "dataset = DataSet(\n", " loader = loader, \n", " metadata_path=\"../testfiles/maxquant/metadata.xlsx\", \n", " sample_column=\"sample\"\n", diff --git a/tests/gui/test_04_preprocessing.py b/tests/gui/test_04_preprocessing.py index d10fa2c4..71c1e0ed 100644 --- a/tests/gui/test_04_preprocessing.py +++ b/tests/gui/test_04_preprocessing.py @@ -1,5 +1,9 @@ from streamlit.testing.v1 import AppTest +from pathlib import Path from .conftest import create_dataset_alphapept, APP_FOLDER + +from alphastats.DataSet import DataSet +from alphastats.load_data import load_data from alphastats.gui.utils.ui_helper import StateKeys TESTED_PAGE = f"{APP_FOLDER}/pages/03_Preprocessing.py" diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py index bc59d36a..01d5a275 100644 --- a/tests/test_DataSet.py +++ b/tests/test_DataSet.py @@ -10,6 +10,7 @@ import os import copy +from alphastats.DataSet_Preprocess import PreprocessingStateKeys from alphastats.loader.DIANNLoader import DIANNLoader from alphastats.loader.MaxQuantLoader import MaxQuantLoader from alphastats.loader.AlphaPeptLoader import AlphaPeptLoader @@ -125,12 +126,16 @@ def test_preprocess_filter_already_filter(self, mock): # is info printed if contamination columns get removed # is the new matrix smaller than the older matrix self.assertFalse( - self.obj.preprocessing_info.get("Contaminations have been removed") + self.obj.preprocessing_info.get( + PreprocessingStateKeys.CONTAMINATIONS_REMOVED + ) ) self.obj.preprocess(remove_contaminations=True) self.assertEqual(self.obj.mat.shape, self.matrix_dim_filtered) self.assertTrue( - self.obj.preprocessing_info.get("Contaminations have been removed") + self.obj.preprocessing_info.get( + PreprocessingStateKeys.CONTAMINATIONS_REMOVED + ) ) self.obj.preprocess(remove_contaminations=True) self.assertEqual(self.obj.mat.shape, self.matrix_dim_filtered)