Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce string constants #324

Merged
merged 5 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 15 additions & 13 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


from alphastats.DataSet_Plot import Plot
from alphastats.DataSet_Preprocess import Preprocess
from alphastats.DataSet_Preprocess import Preprocess, PreprocessingStateKeys
from alphastats.DataSet_Pathway import Enrichment
from alphastats.DataSet_Statistics import Statistics
from alphastats.utils import LoaderError
Expand Down Expand Up @@ -188,7 +188,7 @@ def _remove_misc_samples_in_metadata(self):
def _subset(self):
# filter matrix so only samples that are described in metadata are also found in matrix
self.preprocessing_info.update(
{"Matrix: Number of samples": self.metadata.shape[0]}
{PreprocessingStateKeys.NUM_SAMPLES: self.metadata.shape[0]}
)
return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())]

Expand Down Expand Up @@ -265,17 +265,19 @@ def load_metadata(self, file_path: Union[pd.DataFrame, str]) -> pd.DataFrame:
def _save_dataset_info(self):
n_proteingroups = self.mat.shape[1]
preprocessing_dict = {
"Raw data number of Protein Groups": n_proteingroups,
"Matrix: Number of ProteinIDs/ProteinGroups": self.mat.shape[1],
"Matrix: Number of samples": self.mat.shape[0],
"Intensity used for analysis": self.intensity_column,
"Log2-transformed": False,
"Normalization": None,
"Imputation": None,
"Contaminations have been removed": False,
"Contamination columns": self.filter_columns,
"Number of removed ProteinGroups due to contaminaton": 0,
"Data completeness cut-off": 0,
PreprocessingStateKeys.RAW_DATA_NUM_PG: n_proteingroups,
PreprocessingStateKeys.NUM_PG: self.mat.shape[1],
PreprocessingStateKeys.NUM_SAMPLES: self.mat.shape[0],
PreprocessingStateKeys.INTENSITY_COLUMN: self.intensity_column,
PreprocessingStateKeys.LOG2_TRANSFORMED: False,
PreprocessingStateKeys.NORMALIZATION: None,
PreprocessingStateKeys.IMPUTATION: None,
PreprocessingStateKeys.CONTAMINATIONS_REMOVED: False,
PreprocessingStateKeys.CONTAMINATION_COLUMNS: self.filter_columns,
PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: 0,
PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: 0,
PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: 0,
PreprocessingStateKeys.MISSING_VALUES_REMOVED: False,
}
return preprocessing_dict

Expand Down
59 changes: 44 additions & 15 deletions alphastats/DataSet_Preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,28 @@
from alphastats.utils import ignore_warning


class PreprocessingStateKeys:
    """Keys for accessing the dictionary holding the information about preprocessing.

    The string values double as human-readable labels (they are shown in the
    GUI) and as keys of any previously created ``preprocessing_info`` dicts,
    so changing a value is a breaking change for stored/displayed state.
    """

    RAW_DATA_NUM_PG = "Raw data number of Protein Groups"
    # Keep the "Matrix: ..." spelling of the pre-existing dict keys
    # (see DataSet._save_dataset_info) so this refactor stays key-compatible.
    NUM_PG = "Matrix: Number of ProteinIDs/ProteinGroups"
    NUM_SAMPLES = "Matrix: Number of samples"
    INTENSITY_COLUMN = "Intensity used for analysis"
    LOG2_TRANSFORMED = "Log2-transformed"
    NORMALIZATION = "Normalization"
    IMPUTATION = "Imputation"
    CONTAMINATIONS_REMOVED = "Contaminations have been removed"
    CONTAMINATION_COLUMNS = "Contamination columns"
    # NOTE(review): "contaminaton" typo kept on purpose — it matches the
    # historical dict key; fix only together with a migration of saved state.
    NUM_REMOVED_PG_DUE_TO_CONTAMINATION = (
        "Number of removed ProteinGroups due to contaminaton"
    )
    DATA_COMPLETENESS_CUTOFF = "Data completeness cut-off"
    NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF = (
        "Number of removed ProteinGroups due to data completeness cutoff"
    )
    MISSING_VALUES_REMOVED = "Missing values were removed"


class Preprocess:
imputation_methods = ["mean", "median", "knn", "randomforest"]
normalization_methods = ["vst", "zscore", "quantile"]
Expand Down Expand Up @@ -45,14 +67,17 @@ def _remove_samples(self, sample_list: list):
def _subset(self):
# filter matrix so only samples that are described in metadata are also found in matrix
self.preprocessing_info.update(
{"Matrix: Number of samples": self.metadata.shape[0]}
{PreprocessingStateKeys.NUM_SAMPLES: self.metadata.shape[0]}
)
return self.mat[self.mat.index.isin(self.metadata[self.sample].tolist())]

def _remove_na_values(self, cut_off):
if (
self.preprocessing_info.get("Missing values were removed")
and self.preprocessing_info.get("Data completeness cut-off") == cut_off
self.preprocessing_info.get(PreprocessingStateKeys.MISSING_VALUES_REMOVED)
and self.preprocessing_info.get(
PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF
)
== cut_off
):
logging.info("Missing values have already been filtered.")
st.warning(
Expand Down Expand Up @@ -83,10 +108,10 @@ def _remove_na_values(self, cut_off):

self.preprocessing_info.update(
{
"Number of removed ProteinGroups due to data completeness cutoff": num_proteins
PreprocessingStateKeys.NUM_PG_REMOVED_DUE_TO_DATA_COMPLETENESS_CUTOFF: num_proteins
- self.mat.shape[1],
"Missing values were removed": True,
"Data completeness cut-off": cut_off,
PreprocessingStateKeys.MISSING_VALUES_REMOVED: True,
PreprocessingStateKeys.DATA_COMPLETENESS_CUTOFF: cut_off,
}
)

Expand All @@ -95,7 +120,7 @@ def _filter(self):
logging.info("No columns to filter.")
return

if self.preprocessing_info.get("Contaminations have been removed"):
if self.preprocessing_info.get(PreprocessingStateKeys.CONTAMINATIONS_REMOVED):
logging.info("Contaminatons have already been filtered.")
return

Expand All @@ -113,11 +138,11 @@ def _filter(self):

self.preprocessing_info.update(
{
"Number of removed ProteinGroups due to contaminaton": len(
PreprocessingStateKeys.NUM_REMOVED_PG_DUE_TO_CONTAMINATION: len(
protein_groups_to_remove
),
"Contaminations have been removed": True,
"Matrix: Number of ProteinIDs/ProteinGroups": self.mat.shape[1],
PreprocessingStateKeys.CONTAMINATIONS_REMOVED: True,
PreprocessingStateKeys.NUM_PG: self.mat.shape[1],
}
)

Expand Down Expand Up @@ -177,7 +202,7 @@ def _imputation(self, method: str):
self.mat = pd.DataFrame(
imputation_array, index=self.mat.index, columns=self.mat.columns
)
self.preprocessing_info.update({"Imputation": method})
self.preprocessing_info.update({PreprocessingStateKeys.IMPUTATION: method})

def _linear_normalization(self, dataframe: pd.DataFrame):
"""Normalize data using l2 norm without breaking when encoutering nones
Expand Down Expand Up @@ -227,7 +252,7 @@ def _normalization(self, method: str):
normalized_array, index=self.mat.index, columns=self.mat.columns
)

self.preprocessing_info.update({"Normalization": method})
self.preprocessing_info.update({PreprocessingStateKeys.NORMALIZATION: method})

# TODO this needs to be reimplemented
# @ignore_warning(RuntimeWarning)
Expand Down Expand Up @@ -267,7 +292,7 @@ def _normalization(self, method: str):

def _log2_transform(self):
self.mat = np.log2(self.mat + 0.1)
self.preprocessing_info.update({"Log2-transformed": True})
self.preprocessing_info.update({PreprocessingStateKeys.LOG2_TRANSFORMED: True})
print("Data has been log2-transformed.")

def batch_correction(self, batch: str) -> pd.DataFrame:
Expand Down Expand Up @@ -364,7 +389,11 @@ def preprocess(
if data_completeness > 0:
self._remove_na_values(cut_off=data_completeness)

if log2_transform and self.preprocessing_info.get("Log2-transformed") is False:
if (
log2_transform
and self.preprocessing_info.get(PreprocessingStateKeys.LOG2_TRANSFORMED)
is False
):
self._log2_transform()

if normalization is not None:
Expand All @@ -380,7 +409,7 @@ def preprocess(

self.preprocessing_info.update(
{
"Matrix: Number of ProteinIDs/ProteinGroups": self.mat.shape[1],
PreprocessingStateKeys.NUM_PG: self.mat.shape[1],
}
)

Expand Down
8 changes: 3 additions & 5 deletions alphastats/DataSet_Statistics.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from codecs import ignore_errors
from itertools import permutations
import pandas as pd
import scipy.stats
import numpy as np
import pingouin

from alphastats.DataSet_Preprocess import PreprocessingStateKeys
from alphastats.utils import ignore_warning
from tqdm import tqdm
from functools import lru_cache
from typing import Union

Expand All @@ -22,7 +20,7 @@ def _calculate_foldchange(
) -> pd.DataFrame:
mat_transpose += 0.00001

if self.preprocessing_info["Log2-transformed"]:
if self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]:
fc = (
mat_transpose[group1_samples].T.mean().values
- mat_transpose[group2_samples].T.mean().values
Expand Down
3 changes: 2 additions & 1 deletion alphastats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,11 @@
"development": "requirements_development.txt",
}

# TODO get rid of these imports
from .loader.AlphaPeptLoader import *
from .loader.DIANNLoader import *
from .loader.FragPipeLoader import *
from .loader.MaxQuantLoader import *
from .DataSet import *
from .loader.SpectronautLoader import *
from .cli import *
import alphastats.gui
3 changes: 2 additions & 1 deletion alphastats/gui/pages/02_Import Data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import streamlit as st

from alphastats import DataSet, BaseLoader
from alphastats.DataSet import DataSet
from alphastats import BaseLoader
from alphastats.gui.utils.options import SOFTWARE_OPTIONS

from alphastats.gui.utils.import_helper import (
Expand Down
21 changes: 17 additions & 4 deletions alphastats/gui/utils/overview_helper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import streamlit as st
import pandas as pd

from alphastats import DataSet
from alphastats.DataSet_Preprocess import PreprocessingStateKeys
from alphastats.DataSet import DataSet
from alphastats.gui.utils.ui_helper import convert_df


Expand Down Expand Up @@ -33,11 +34,23 @@ def get_display_matrix():
def display_matrix():
text = (
"Normalization: "
+ str(st.session_state.dataset.preprocessing_info["Normalization"])
+ str(
st.session_state.dataset.preprocessing_info[
PreprocessingStateKeys.NORMALIZATION
]
)
+ ", Imputation: "
+ str(st.session_state.dataset.preprocessing_info["Imputation"])
+ str(
st.session_state.dataset.preprocessing_info[
PreprocessingStateKeys.IMPUTATION
]
)
+ ", Log2-transformed: "
+ str(st.session_state.dataset.preprocessing_info["Log2-transformed"])
+ str(
st.session_state.dataset.preprocessing_info[
PreprocessingStateKeys.LOG2_TRANSFORMED
]
)
)

st.markdown("**DataFrame used for analysis** *preview*")
Expand Down
2 changes: 1 addition & 1 deletion alphastats/gui/utils/preprocessing_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
from st_cytoscape import cytoscape

from alphastats import DataSet
from alphastats.DataSet import DataSet

CYTOSCAPE_STYLESHEET = [
{
Expand Down
8 changes: 6 additions & 2 deletions alphastats/plots/VolcanoPlot.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from alphastats.DataSet_Preprocess import PreprocessingStateKeys
from alphastats.plots.PlotUtils import PlotUtils, plotly_object
from alphastats.utils import ignore_warning, check_for_missing_values
from alphastats.utils import ignore_warning

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -158,7 +159,10 @@ def _sam(self):

transposed = self.dataset.mat.transpose()

if self.dataset.preprocessing_info["Normalization"] is None:
if (
self.dataset.preprocessing_info[PreprocessingStateKeys.NORMALIZATION]
is None
):
# needs to be log2-transformed for fold change calculations
transposed = transposed.transform(lambda x: np.log2(x))

Expand Down
9 changes: 7 additions & 2 deletions alphastats/statistics/DifferentialExpressionAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import scipy
from typing import Union

from alphastats.DataSet_Preprocess import PreprocessingStateKeys


class DifferentialExpressionAnalysis:
def __init__(
Expand Down Expand Up @@ -99,7 +101,10 @@ def sam(self) -> pd.DataFrame:

transposed = self.dataset.mat.transpose()

if self.dataset.preprocessing_info["Normalization"] is None:
if (
self.dataset.preprocessing_info[PreprocessingStateKeys.NORMALIZATION]
is None
):
# needs to be log2-transformed for fold change calculations
transposed = transposed.transform(lambda x: np.log2(x))

Expand Down Expand Up @@ -230,7 +235,7 @@ def _calculate_foldchange(
):
mat_transpose += 0.00001

if self.dataset.preprocessing_info["Log2-transformed"]:
if self.dataset.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]:
fc = (
mat_transpose[group1_samples].T.mean().values
- mat_transpose[group2_samples].T.mean().values
Expand Down
10 changes: 0 additions & 10 deletions docs/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,3 @@ All GO-analysis functions will return a DataFrame with the results.

* Plot Scatterplot with -log10(p-value) on x-axis and effect size on y-axis. `df.plot_scatter()`
* Plot p-values as Barplot `df.plot_bar`


Misc
------

Get an overview over your dataset

* :py:meth:`~alphastats.DataSet.overview`

* :py:meth:`~alphastats.DataSet_Preprocess.Preprocess.preprocess_print_info`
2 changes: 1 addition & 1 deletion docs/import_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ To compare samples across various conditions in the downstream analysis, a metad

## Creating a DataSet

The whole downstream analysis can be perforemd on the alphastats.DataSet. To create the DataSet you need to provide the loader object as well as the metadata.
The whole downstream analysis can be performed on the alphastats.DataSet. To create the DataSet you need to provide the loader object as well as the metadata.

```python
import alphastats
Expand Down
5 changes: 4 additions & 1 deletion nbs/getting_started.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
"source": [
"import pandas as pd\n",
"import warnings\n",
"\n",
"from alphastats.DataSet import DataSet\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=UserWarning, module=\"openpyxl\") # remove warning from mac"
]
},
Expand Down Expand Up @@ -713,7 +716,7 @@
}
],
"source": [
"ds = alphastats.DataSet(\n",
"ds = DataSet(\n",
" loader = maxquant_data, \n",
" metadata_path = \"../testfiles/maxquant/metadata.xlsx\",\n",
" sample_column = \"sample\" # specify the column that corresponds to the sample names in proteinGroups\n",
Expand Down
5 changes: 4 additions & 1 deletion nbs/liu_2019.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
"source": [
"import alphastats\n",
"import plotly.io as pio\n",
"\n",
"from alphastats.DataSet import DataSet\n",
"\n",
"pio.renderers.default = \"plotly_mimetype+notebook\" "
]
},
Expand Down Expand Up @@ -87,7 +90,7 @@
" index_column=\"Gene names\",\n",
" gene_names_column=None\n",
")\n",
"dataset = alphastats.DataSet(\n",
"dataset = DataSet(\n",
" loader = loader, \n",
" metadata_path=\"../testfiles/maxquant/metadata.xlsx\", \n",
" sample_column=\"sample\"\n",
Expand Down
Loading
Loading