Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Idhandling I #378

Open
wants to merge 14 commits into
base: development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 55 additions & 28 deletions alphastats/dataset/dataset.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
Expand Down Expand Up @@ -101,23 +102,11 @@ def __init__(
self.mat: pd.DataFrame = mat
self.metadata: pd.DataFrame = metadata
self.preprocessing_info: Dict = preprocessing_info

self._gene_name_to_protein_id_map = (
{
k: v
for k, v in dict(
zip(
self.rawinput[Cols.GENE_NAMES].tolist(),
self.rawinput[Cols.INDEX].tolist(),
)
).items()
if isinstance(k, str) # avoid having NaN as key
}
if Cols.GENE_NAMES in self.rawinput.columns
else {}
)
# TODO This is not necessarily unique, and should ideally raise an error in some of our test-data sets that
# contain isoform ids. E.g. TPM1 occurs 5 times in testfiles/maxquant/proteinGroups.txt with different base Protein IDs.
(
self._gene_to_features_map,
self._protein_to_features_map,
self._feature_to_repr_map,
) = self._create_id_dicts()

print("DataSet has been created.")

Expand Down Expand Up @@ -161,6 +150,37 @@ def _check_loader(loader):
"Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
)

def _create_id_dicts(self, sep: str = ";") -> Tuple[dict, dict, dict]:
    """Create mappings from gene and protein ids to features, and from features to a display name.

    Only features that are columns of ``self.mat`` are considered; rows of
    ``self.rawinput`` whose index value is not a column of ``self.mat`` are skipped.

    Args:
        sep (str): Separator between individual ids inside one protein-group /
            gene-names entry. Defaults to ";".

    Returns:
        Tuple[dict, dict, dict]:
            - gene_to_features_map: single gene name -> list of features containing it.
              Empty if ``self.rawinput`` has no gene names column.
            - protein_to_features_map: single protein id -> list of features containing it.
            - feature_to_repr_map: feature -> display string. This is the gene names
              entry if it is a string, otherwise "ids:" followed by the feature.
              Looking up an unknown feature returns the feature itself.
    """

    class _KeyAsDefault(dict):
        """Dict whose ``[]`` lookup falls back to the key itself for unknown keys."""

        # defaultdict cannot do this: its default_factory is called without
        # arguments, so a factory like `lambda x: x` raises TypeError on a miss.
        def __missing__(self, key):
            return key

    # A set gives O(1) membership tests inside the loops below.
    features = set(self.mat.columns.to_list())
    gene_to_features_map = defaultdict(list)
    protein_to_features_map = defaultdict(list)
    feature_to_repr_map = _KeyAsDefault()

    # The feature is the (possibly multi-id) protein group string itself.
    for feature in self.rawinput[Cols.INDEX]:
        if feature not in features:
            continue
        # TODO: Shorten list if too many ids e.g. to id1;...(19) if 20 ids are present
        feature_to_repr_map[feature] = "ids:" + feature
        for protein in feature.split(sep):
            protein_to_features_map[protein].append(feature)

    if Cols.GENE_NAMES in self.rawinput.columns:
        for genes, feature in self.rawinput[
            [Cols.GENE_NAMES, Cols.INDEX]
        ].itertuples(index=False):
            if feature not in features:
                continue
            # Missing gene names are non-string (e.g. NaN); keep the "ids:" repr then.
            if isinstance(genes, str):
                for gene in genes.split(sep):
                    gene_to_features_map[gene].append(feature)
                feature_to_repr_map[feature] = genes

    return gene_to_features_map, protein_to_features_map, feature_to_repr_map

def _get_preprocess(self) -> Preprocess:
"""Return instance of the Preprocess object."""
return Preprocess(
Expand Down Expand Up @@ -199,6 +219,11 @@ def preprocess(
**kwargs,
)
)
(
self._gene_to_features_map,
self._protein_to_features_map,
self._feature_to_repr_map,
) = self._create_id_dicts()

def reset_preprocessing(self):
"""Reset all preprocessing steps"""
Expand All @@ -208,6 +233,11 @@ def reset_preprocessing(self):
self.metadata,
self.preprocessing_info,
) = self._get_init_dataset()
(
self._gene_to_features_map,
self._protein_to_features_map,
self._feature_to_repr_map,
) = self._create_id_dicts()

def batch_correction(self, batch: str) -> None:
"""A wrapper for Preprocess.batch_correction(), see documentation there."""
Expand Down Expand Up @@ -434,25 +464,22 @@ def plot_volcano(

return volcano_plot.plot

def _get_protein_id_for_gene_name(
def _get_features_for_gene_name(
self,
gene_name: str,
) -> str:
"""Get protein id from gene id. If gene id is not present, return gene id, as we might already have a gene id.
'VCL;HEL114' -> 'P18206;A0A024QZN4;V9HWK2;B3KXA2;Q5JQ13;B4DKC9;B4DTM7;A0A096LPE1'
) -> list:
"""Get protein groups from gene id. If gene id is not present, return gene id, as we might already have a gene id.
'HEL114' -> ['P18206;A0A024QZN4;V9HWK2;B3KXA2;Q5JQ13;B4DKC9;B4DTM7;A0A096LPE1']

Args:
gene_name (str): Gene name

Returns:
str: Protein id or gene name if not present in the mapping.
list: Protein group ids or gene name if not present in the mapping.
"""
if gene_name in self._gene_name_to_protein_id_map:
return self._gene_name_to_protein_id_map[gene_name]
if gene_name in self._gene_to_features_map:
return self._gene_to_features_map[gene_name]

for gene, protein_id in self._gene_name_to_protein_id_map.items():
if gene_name in gene.split(";"):
return protein_id
return gene_name

def plot_intensity(
Expand Down Expand Up @@ -492,7 +519,7 @@ def plot_intensity(
if gene_name is None and protein_id is not None:
pass
elif gene_name is not None and protein_id is None:
protein_id = self._get_protein_id_for_gene_name(gene_name)
protein_id = self._get_features_for_gene_name(gene_name)
else:
raise ValueError(
"Either protein_id or gene_name must be provided, but not both."
Expand Down
2 changes: 1 addition & 1 deletion alphastats/gui/pages/05_Analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def show_start_llm_button(analysis_method: str) -> None:
st.session_state[StateKeys.LLM_INPUT] = (analysis_object, parameters)

st.toast("LLM analysis created!", icon="✅")
st.page_link("pages/05_LLM.py", label="=> Go to LLM page..")
st.page_link("pages/06_LLM.py", label="=> Go to LLM page..")


if analysis_result is not None:
Expand Down
4 changes: 3 additions & 1 deletion alphastats/loader/generic_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def __init__(
file: Union[str, pd.DataFrame],
intensity_column: list,
index_column: str,
gene_names_column: str = None,
sep: str = None,
):
"""Generic Loader for you proteomics data
Expand All @@ -33,7 +34,8 @@ def __init__(
self.confidence_column = None
self.software = "Generic"
self.evidence_df = None
self.gene_names_column = None
if gene_names_column in self.rawinput.columns.to_list():
self.gene_names_column = gene_names_column
self.ptm_df = None
self._add_contamination_column()
self._check_if_columns_are_present()
Expand Down
18 changes: 18 additions & 0 deletions alphastats/loader/maxquant_loader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from typing import Union

import numpy as np
Expand Down Expand Up @@ -45,6 +46,23 @@ def __init__(
self._set_filter_columns_to_true_false()
self._read_all_column_names_as_string()

intensity_columns = [
col
for col in self.rawinput.columns
if intensity_column.replace("[sample]", "") in col
]
if len(self.rawinput.dropna(subset=intensity_columns, how="all")) != len(
self.rawinput
):
valid_id = re.compile(
"[A-Z]"
) # Assuming that all valid protein ids would contain at least one letter.
self.rawinput = self.rawinput[
self.rawinput[self.index_column].apply(
lambda x: bool(valid_id.match(x))
)
]

if gene_names_column in self.rawinput.columns.to_list():
self.gene_names_column = gene_names_column

Expand Down
38 changes: 21 additions & 17 deletions alphastats/plots/intensity_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import scipy

from alphastats.dataset.keys import Cols
from alphastats.dataset.preprocessing import PreprocessingStateKeys
from alphastats.plots.plot_utils import PlotlyObject, PlotUtils

plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template(
Expand Down Expand Up @@ -42,7 +43,7 @@ def __init__(
preprocessing_info: Dict,
protein_id,
group,
subgroups,
subgroups=None,
method,
add_significance,
log_scale,
Expand All @@ -52,12 +53,15 @@ def __init__(
self.intensity_column = intensity_column
self.preprocessing_info = preprocessing_info

self.protein_id = protein_id
self.protein_id = [protein_id] if isinstance(protein_id, str) else protein_id
self.group = group
self.subgroups = subgroups
self.method = method
self.add_significance = add_significance
self.log_scale = log_scale
self.y_axis = self.intensity_column.replace("[sample]", "").strip()
if self.preprocessing_info[PreprocessingStateKeys.LOG2_TRANSFORMED]:
self.y_axis = "log2(" + self.yaxis + ")"

self.prepared_df = None
self._prepare_data()
Expand Down Expand Up @@ -137,58 +141,58 @@ def _add_significance(plot):
def _prepare_data(self):
# TODO use difflib to find similar ProteinId if ProteinGroup is not present
df = (
self.mat[[self.protein_id]]
.reset_index()
.rename(columns={"index": Cols.SAMPLE})
)
self.mat[self.protein_id].melt(
ignore_index=False,
value_name=self.y_axis,
var_name=Cols.INDEX,
)
).dropna()
df = df.reset_index().rename(columns={"index": Cols.SAMPLE})
df = df.merge(self.metadata, how="inner", on=[Cols.SAMPLE])

if self.subgroups is not None:
df = df[df[self.group].isin(self.subgroups)]

self.y_label = (
self.protein_id + " - " + self.intensity_column.replace("[sample]", "")
)
self.prepared_df = df

def _plot(self):
if self.method == "violin":
fig = px.violin(
self.prepared_df,
y=self.protein_id,
y=self.y_axis,
x=self.group,
facet_col=Cols.INDEX,
color=self.group,
labels={self.protein_id: self.y_label},
template="simple_white+alphastats_colors",
)

elif self.method == "box":
fig = px.box(
self.prepared_df,
y=self.protein_id,
y=self.y_axis,
x=self.group,
facet_col=Cols.INDEX,
color=self.group,
labels={self.protein_id: self.y_label},
template="simple_white+alphastats_colors",
)

elif self.method == "scatter":
fig = px.scatter(
self.prepared_df,
y=self.protein_id,
y=self.y_axis,
x=self.group,
facet_col=Cols.INDEX,
color=self.group,
labels={self.protein_id: self.y_label},
template="simple_white+alphastats_colors",
)

elif self.method == "all":
fig = px.violin(
self.prepared_df,
y=self.protein_id,
y=self.y_axis,
x=self.group,
facet_col=Cols.INDEX,
color=self.group,
labels={self.protein_id: self.y_label},
box=True,
points="all",
template="simple_white+alphastats_colors",
Expand Down
2 changes: 1 addition & 1 deletion tests/gui/test_02_import_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,5 +106,5 @@ def test_page_02_loads_maxquant_testfiles(

dataset = at.session_state[StateKeys.DATASET]
assert dataset._intensity_column == "LFQ intensity [sample]"
assert dataset.rawmat.shape == (312, 2611)
assert dataset.rawmat.shape == (312, 2249)
assert dataset.software == "MaxQuant"
Loading
Loading