MannLabs · JuliaS92 · Nov 27, 2024 · Nov 19, 2024 · Nov 19, 2024 · Nov 19, 2024
diff --git a/alphastats/dataset/dataset.py b/alphastats/dataset/dataset.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from typing import Dict, List, Optional, Tuple, Union
 
 import pandas as pd
@@ -101,23 +102,11 @@ def __init__(
         self.mat: pd.DataFrame = mat
         self.metadata: pd.DataFrame = metadata
         self.preprocessing_info: Dict = preprocessing_info
-
-        self._gene_name_to_protein_id_map = (
-            {
-                k: v
-                for k, v in dict(
-                    zip(
-                        self.rawinput[Cols.GENE_NAMES].tolist(),
-                        self.rawinput[Cols.INDEX].tolist(),
-                    )
-                ).items()
-                if isinstance(k, str)  # avoid having NaN as key
-            }
-            if Cols.GENE_NAMES in self.rawinput.columns
-            else {}
-        )
-        # TODO This is not necessarily unique, and should ideally raise an error in some of our test-data sets that
-        #  contain isoform ids. E.g. TPM1 occurs 5 times in testfiles/maxquant/proteinGroups.txt with different base Protein IDs.
+        (
+            self._gene_to_features_map,
+            self._protein_to_features_map,
+            self._feature_to_repr_map,
+        ) = self._create_id_dicts()
 
         print("DataSet has been created.")
 
@@ -161,6 +150,55 @@ def _check_loader(loader):
                 "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
             )
 
+    def _create_id_dicts(self, sep: str = ";") -> Tuple[dict, dict, dict]:
+        """
+        Create mappings from gene and protein to feature, and from feature to representation.
+        Features are the entities measured in each sample, usually protein groups represented by semicolon-separated protein ids.
+        This is to maintain the many-to-many relationships between the three entities feature, protein and gene.
+
+        This method processes the raw input data to generate three dictionaries:
+        1. gene_to_features_map: Maps each gene to a list of features.
+        2. protein_to_features_map: Maps each protein to a list of features.
+        3. feature_to_repr_map: Maps each feature to its representation string (the gene names if available, otherwise "ids:" followed by the protein ids).
+
+        Args:
+            sep (str): The separator used to split gene and protein identifiers. Default is ";".
+
+        Returns:
+            Tuple[dict, dict, dict]: A tuple containing three dictionaries:
+            - gene_to_features_map (dict): A dictionary mapping genes to features.
+            - protein_to_features_map (dict): A dictionary mapping proteins to features.
+            - feature_to_repr_map (dict): A dictionary mapping features to their representation strings.
+        """
+
+        features = set(self.mat.columns.to_list())
+        gene_to_features_map = defaultdict(list)
+        protein_to_features_map = defaultdict(list)
+        feature_to_repr_map = {}
+
+        for proteins, feature in zip(
+            self.rawinput[Cols.INDEX], self.rawinput[Cols.INDEX]
+        ):
+            if feature not in features:
+                continue
+            # TODO: Shorten list if too many ids e.g. to id1;...(19) if 20 ids are present
+            feature_to_repr_map[feature] = "ids:" + proteins
+            for protein in proteins.split(sep):
+                protein_to_features_map[protein].append(feature)
+
+        if Cols.GENE_NAMES in self.rawinput.columns:
+            for genes, feature in zip(
+                self.rawinput[Cols.GENE_NAMES], self.rawinput[Cols.INDEX]
+            ):
+                if feature not in features:
+                    continue
+                if isinstance(genes, str):
+                    for gene in genes.split(sep):
+                        gene_to_features_map[gene].append(feature)
+                    feature_to_repr_map[feature] = genes
+
+        return gene_to_features_map, protein_to_features_map, feature_to_repr_map
+
     def _get_preprocess(self) -> Preprocess:
         """Return instance of the Preprocess object."""
         return Preprocess(
@@ -199,6 +237,11 @@ def preprocess(
                 **kwargs,
             )
         )
+        (
+            self._gene_to_features_map,
+            self._protein_to_features_map,
+            self._feature_to_repr_map,
+        ) = self._create_id_dicts()
 
     def reset_preprocessing(self):
         """Reset all preprocessing steps"""
@@ -208,6 +251,11 @@ def reset_preprocessing(self):
             self.metadata,
             self.preprocessing_info,
         ) = self._get_init_dataset()
+        (
+            self._gene_to_features_map,
+            self._protein_to_features_map,
+            self._feature_to_repr_map,
+        ) = self._create_id_dicts()
 
     def batch_correction(self, batch: str) -> None:
         """A wrapper for Preprocess.batch_correction(), see documentation there."""
@@ -419,6 +467,7 @@ def plot_volcano(
             rawinput=self.rawinput,
             metadata=self.metadata,
             preprocessing_info=self.preprocessing_info,
+            feature_to_repr_map=self._feature_to_repr_map,
             group1=group1,
             group2=group2,
             column=column,
@@ -434,26 +483,22 @@ def plot_volcano(
 
         return volcano_plot.plot
 
-    def _get_protein_id_for_gene_name(
+    def _get_features_for_gene_name(
         self,
         gene_name: str,
-    ) -> str:
-        """Get protein id from gene id. If gene id is not present, return gene id, as we might already have a gene id.
-        'VCL;HEL114' -> 'P18206;A0A024QZN4;V9HWK2;B3KXA2;Q5JQ13;B4DKC9;B4DTM7;A0A096LPE1'
+    ) -> list:
+        """Get the features for a gene name. Raises a ValueError if the gene name is not present in the (processed) data.
+        'HEL114' -> ['P18206;A0A024QZN4;V9HWK2;B3KXA2;Q5JQ13;B4DKC9;B4DTM7;A0A096LPE1']
 
         Args:
             gene_name (str): Gene name
 
         Returns:
-            str: Protein id or gene name if not present in the mapping.
+            list: Features (protein group ids) that the gene name maps to.
         """
-        if gene_name in self._gene_name_to_protein_id_map:
-            return self._gene_name_to_protein_id_map[gene_name]
-
-        for gene, protein_id in self._gene_name_to_protein_id_map.items():
-            if gene_name in gene.split(";"):
-                return protein_id
-        return gene_name
+        if gene_name in self._gene_to_features_map:
+            return self._gene_to_features_map[gene_name]
+        raise ValueError(f"Gene {gene_name} is not in the (processed) data.")
 
     def plot_intensity(
         self,
@@ -492,7 +537,7 @@ def plot_intensity(
         if gene_name is None and protein_id is not None:
             pass
         elif gene_name is not None and protein_id is None:
-            protein_id = self._get_protein_id_for_gene_name(gene_name)
+            protein_id = self._get_features_for_gene_name(gene_name)
         else:
             raise ValueError(
                 "Either protein_id or gene_name must be provided, but not both."

diff --git a/alphastats/dataset/plotting.py b/alphastats/dataset/plotting.py
@@ -12,6 +12,7 @@
 from alphastats.plots.plot_utils import PlotUtils
 
 
+# TODO: Remove redundancy with PlotlyObject
 class plotly_object(plotly.graph_objs._figure.Figure):
     plotting_data = None
     preprocessing = None

diff --git a/alphastats/dataset/preprocessing.py b/alphastats/dataset/preprocessing.py
@@ -335,7 +335,6 @@ def _normalization(self, method: str) -> None:
     def _log2_transform(self):
         self.mat = np.log2(self.mat)
         self.mat = self.mat.replace([np.inf, -np.inf], np.nan)
-        # TODO: Ideally we wouldn't need to replace infs if all downstream methods can handle them
         self.preprocessing_info.update({PreprocessingStateKeys.LOG2_TRANSFORMED: True})
         print("Data has been log2-transformed.")
 

diff --git a/alphastats/gui/pages/05_Analysis.py b/alphastats/gui/pages/05_Analysis.py
@@ -4,6 +4,8 @@
 from alphastats.gui.utils.analysis_helper import (
     display_analysis_result_with_buttons,
     gather_parameters_and_do_analysis,
+    gather_uniprot_data,
+    get_regulated_features,
 )
 from alphastats.gui.utils.ui_helper import (
     StateKeys,
@@ -92,9 +94,12 @@ def show_start_llm_button(analysis_method: str) -> None:
         if StateKeys.LLM_INTEGRATION in st.session_state:
             del st.session_state[StateKeys.LLM_INTEGRATION]
         st.session_state[StateKeys.LLM_INPUT] = (analysis_object, parameters)
+        regulated_features = get_regulated_features(analysis_object)
+        # TODO: Add confirmation prompt if an excessive number of proteins is to be looked up.
+        gather_uniprot_data(regulated_features)
 
         st.toast("LLM analysis created!", icon="✅")
-        st.page_link("pages/05_LLM.py", label="=> Go to LLM page..")
+        st.page_link("pages/06_LLM.py", label="=> Go to LLM page..")
 
 
 if analysis_result is not None:

diff --git a/alphastats/gui/pages/06_LLM.py b/alphastats/gui/pages/06_LLM.py
@@ -4,17 +4,25 @@
 import streamlit as st
 from openai import AuthenticationError
 
+from alphastats.dataset.keys import Cols
+from alphastats.dataset.plotting import plotly_object
 from alphastats.gui.utils.analysis_helper import (
     display_figure,
 )
 from alphastats.gui.utils.llm_helper import (
+    display_uniprot,
     get_display_proteins_html,
     llm_connection_test,
     set_api_key,
 )
-from alphastats.gui.utils.ui_helper import StateKeys, init_session_state, sidebar_info
+from alphastats.gui.utils.ui_helper import (
+    StateKeys,
+    init_session_state,
+    sidebar_info,
+)
 from alphastats.llm.llm_integration import LLMIntegration, Models
 from alphastats.llm.prompts import get_initial_prompt, get_system_message
+from alphastats.plots.plot_utils import PlotlyObject
 
 OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
 
@@ -99,7 +107,7 @@ def llm_config():
 with c1:
     regulated_genes_df = volcano_plot.res[volcano_plot.res["label"] != ""]
     regulated_genes_dict = dict(
-        zip(regulated_genes_df["label"], regulated_genes_df["color"].tolist())
+        zip(regulated_genes_df[Cols.INDEX], regulated_genes_df["color"].tolist())
     )
 
     if not regulated_genes_dict:
@@ -118,17 +126,37 @@ def llm_config():
     with c11:
         st.write("Upregulated genes")
         st.markdown(
-            get_display_proteins_html(upregulated_genes, True), unsafe_allow_html=True
+            get_display_proteins_html(
+                upregulated_genes,
+                True,
+                annotation_store=st.session_state[StateKeys.ANNOTATION_STORE],
+                feature_to_repr_map=st.session_state[
+                    StateKeys.DATASET
+                ]._feature_to_repr_map,
+            ),
+            unsafe_allow_html=True,
         )
 
     with c12:
         st.write("Downregulated genes")
         st.markdown(
-            get_display_proteins_html(downregulated_genes, False),
+            get_display_proteins_html(
+                downregulated_genes,
+                False,
+                annotation_store=st.session_state[StateKeys.ANNOTATION_STORE],
+                feature_to_repr_map=st.session_state[
+                    StateKeys.DATASET
+                ]._feature_to_repr_map,
+            ),
             unsafe_allow_html=True,
         )
 
 
+st.markdown("##### Select which information from Uniprot to supply to the LLM")
+display_uniprot(
+    regulated_genes_dict, st.session_state[StateKeys.DATASET]._feature_to_repr_map
+)
+
 st.markdown("##### Prompts generated based on analysis input")
 
 model_name = st.session_state[StateKeys.MODEL_NAME]
@@ -218,8 +246,8 @@ def llm_chat(llm_integration: LLMIntegration, show_all: bool = False):
             for artifact in message["artifacts"]:
                 if isinstance(artifact, pd.DataFrame):
                     st.dataframe(artifact)
-                elif "plotly" in str(
-                    type(artifact)
+                elif isinstance(
+                    artifact, (PlotlyObject, plotly_object)
                 ):  # TODO can there be non-plotly types here
                     st.plotly_chart(artifact)
                 elif not isinstance(artifact, str):

diff --git a/alphastats/gui/utils/analysis.py b/alphastats/gui/utils/analysis.py
@@ -189,12 +189,21 @@ def show_widget(self):
         """Gather parameters for intensity plot analysis."""
         super().show_widget()
 
-        protein_id = st.selectbox(
-            "ProteinID/ProteinGroup",
-            options=self._dataset.mat.columns.to_list(),
+        protein_id_or_gene_name = st.selectbox(
+            "Gene or protein identifier to plot",
+            options=list(self._dataset._gene_to_features_map.keys())
+            + list(self._dataset._protein_to_features_map.keys()),
         )
 
-        self._parameters.update({"protein_id": protein_id})
+        self._parameters.update(
+            {
+                "protein_id": self._dataset._gene_to_features_map[
+                    protein_id_or_gene_name
+                ]
+                if protein_id_or_gene_name in self._dataset._gene_to_features_map
+                else self._dataset._protein_to_features_map[protein_id_or_gene_name]
+            }
+        )
 
     def _do_analysis(self):
         """Draw Intensity Plot using the IntensityPlot class."""
@@ -327,6 +336,7 @@ def _do_analysis(self):
             rawinput=self._dataset.rawinput,
             metadata=self._dataset.metadata,
             preprocessing_info=self._dataset.preprocessing_info,
+            feature_to_repr_map=self._dataset._feature_to_repr_map,
             group1=self._parameters["group1"],
             group2=self._parameters["group2"],
             column=self._parameters["column"],

diff --git a/alphastats/gui/utils/analysis_helper.py b/alphastats/gui/utils/analysis_helper.py
@@ -3,7 +3,9 @@
 
 import pandas as pd
 import streamlit as st
+from stqdm import stqdm
 
+from alphastats.dataset.keys import Cols
 from alphastats.gui.utils.analysis import (
     ANALYSIS_OPTIONS,
     PlottingOptions,
@@ -13,6 +15,7 @@
     StateKeys,
     show_button_download_df,
 )
+from alphastats.llm.uniprot_utils import get_annotations_for_feature
 from alphastats.plots.plot_utils import PlotlyObject
 
 
@@ -197,3 +200,49 @@ def gather_parameters_and_do_analysis(
 
     else:
         raise ValueError(f"Analysis method {analysis_method} not found.")
+
+
+def gather_uniprot_data(features: list) -> None:
+    """
+    Gathers UniProt data for a list of features and stores it in the session state.
+
+    Features that already have an entry in the annotation store of the session state are skipped.
+
+    Args:
+        features (list): A list of features for which UniProt data needs to be gathered.
+    Returns:
+        None
+    """
+    for feature in stqdm(
+        features,
+        desc="Retrieving uniprot data on regulated features ...",
+        mininterval=1,
+    ):
+        if feature in st.session_state[StateKeys.ANNOTATION_STORE]:
+            continue
+        # TODO: Add some kind of rate limitation to avoid being locked out by uniprot
+        st.session_state[StateKeys.ANNOTATION_STORE][feature] = (
+            get_annotations_for_feature(feature)
+        )
+
+
+def get_regulated_features(analysis_object: PlotlyObject) -> list:
+    """
+    Retrieve regulated features from the analysis object.
+    This function extracts features that are labeled (i.e., have a non-empty label)
+    from the analysis results. It is specifically designed to work with volcano plots.
+    Args:
+        analysis_object (PlotlyObject): An object containing analysis results,
+                                        including feature indices and labels.
+    Returns:
+        list: A list of regulated features that have non-empty labels.
+    """
+    # TODO: add a method to the AbstractAnalysis class to retrieve regulated features upon analysis to store in the session state. This function here only works for volcano plots.
+    regulated_features = [
+        feature
+        for feature, label in zip(
+            analysis_object.res[Cols.INDEX], analysis_object.res["label"]
+        )
+        if label != ""
+    ]
+    return regulated_features