Merge pull request #35 from SCAI-BIO/add-embedding-function-to-data-dict

tiadams · web-flow · commit 5ca0f0a687f1 · 2024-10-21T10:46:06.000+02:00
Add function to directly retrieve embedding from data dictionary
diff --git a/datastew/process/parsing.py b/datastew/process/parsing.py
@@ -1,4 +1,6 @@
 from abc import ABC
+from typing import Dict
+from datastew.embedding import EmbeddingModel, MPNetAdapter
 
 import pandas as pd
 import numpy as np
@@ -52,16 +54,34 @@ def to_dataframe(self) -> pd.DataFrame:
 
 
 class DataDictionarySource(Source):
-    """
-    Contains mapping of variable -> description
-    """
 
     def __init__(self, file_path: str, variable_field: str, description_field: str):
-        self.file_path = file_path
-        self.variable_field = variable_field
-        self.description_field = description_field
-
-    def to_dataframe(self) -> pd.DataFrame:
+        """
+        Initialize the DataDictionarySource with the path to the data dictionary file
+        and the fields that represent the variables and their descriptions.
+
+        :param file_path: Path to the data dictionary file.
+        :param variable_field: The column that contains the variable names.
+        :param description_field: The column that contains the variable descriptions.
+        """
+        self.file_path: str = file_path
+        self.variable_field: str = variable_field
+        self.description_field: str = description_field
+
+    def to_dataframe(self, dropna: bool = True) -> pd.DataFrame:
+        """
+        Load the data dictionary file into a pandas DataFrame, verify that the variable
+        and description fields exist, then select and rename them. Optionally remove rows
+        with missing variables or descriptions based on the 'dropna' parameter.
+
+        :param dropna: If True, rows with missing 'variable' or 'description' values are 
+                       dropped. Defaults to True.
+        :return: A DataFrame containing two columns:
+                 - 'variable': The variable names from the data dictionary.
+                 - 'description': The descriptions corresponding to each variable.
+        :raises ValueError: If either the variable field or the description field is not 
+                            found in the data dictionary file.
+        """
         df = super().to_dataframe()
         # sanity check
         if self.variable_field not in df.columns:
@@ -70,9 +90,32 @@ def to_dataframe(self) -> pd.DataFrame:
             raise ValueError(f"Description field {self.description_field} not found in {self.file_path}")
         df = df[[self.variable_field, self.description_field]]
         df = df.rename(columns={self.variable_field: "variable", self.description_field: "description"})
-        df.dropna(subset=["variable", "description"], inplace=True)
+        if dropna:
+            df.dropna(subset=["variable", "description"], inplace=True)
         return df
-
+    
+    def get_embeddings(self, embedding_model: EmbeddingModel = None) -> Dict[str, list]:
+        """
+        Compute an embedding vector for each description in the data dictionary and
+        return the vectors as a dictionary keyed by variable name. Rows with a missing
+        variable or description are dropped before the embeddings are computed.
+
+        :param embedding_model: The embedding model used to compute embeddings for the descriptions.
+                                If None, a new MPNetAdapter instance is created and used.
+        :return: A dictionary where each key is a variable name and the value is the 
+                 embedding vector for the corresponding description.
+        :rtype: Dict[str, list]
+        """
+        # Compute vectors for all descriptions
+        df: pd.DataFrame = self.to_dataframe()
+        descriptions: list[str] = df["description"].tolist()
+        if embedding_model is None:
+            embedding_model = MPNetAdapter()
+        embeddings = embedding_model.get_embeddings(descriptions)
+        # Pair each variable name with the embedding of its description
+        variable_to_embedding: Dict[str, list] = dict(zip(df["variable"], embeddings))
+        return variable_to_embedding
+        
 
 class EmbeddingSource:
     def __init__(self, source_path: str):
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -39,3 +39,8 @@ def test_parse_data_dict_excel(self):
         mapping_table.add_descriptions(data_dictionary_source)
         mappings = mapping_table.get_mappings()
         self.assertEqual(11, len(mappings))
+
+    def test_get_embeddings(self):
+        vectors = self.data_dictionary_source.get_embeddings()
+        self.assertEqual(len(vectors), 11)
+        self.assertIn("Q_8", vectors)