This repository was archived by the owner on Feb 15, 2025. It is now read-only.

Commit ad697cd

feat: use chunk data in NIAH and QA evals (#1176)

* incorporate chunk data into NIAH retrieval metric
* add chunk_rank metric
* add DeepEval metrics
* fix NIAH padding bug

Parent commit: b9f6413

File tree: 9 files changed (+177 lines, -32 lines)

src/leapfrogai_evals/.env.example
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev/openai/v1"
+LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev"
 LEAPFROGAI_API_KEY="lfai-api-key"
 ANTHROPIC_API_KEY="anthropic-api-key"

src/leapfrogai_evals/README.md
Lines changed: 2 additions & 1 deletion

@@ -18,7 +18,7 @@ cp .env.example .env
 Within `.env`, replace the necessary environment variables:

 ```bash
-LEAPFROGAI_API_URL=<LeapfrogAI API url, usually: https://leapfrogai-api.uds.dev/openai/v1 for development>
+LEAPFROGAI_API_URL=<LeapfrogAI API url, usually: https://leapfrogai-api.uds.dev for development>
 LEAPFROGAI_API_KEY=<LeapfrogAI API key>
 ANTHROPIC_API_KEY=<Anthropic API key>
 ```
@@ -108,6 +108,7 @@ The LeapfrogAI NIAH evaluation uses the following process:
 - prompt the LLM to provide the secret code hidden in the context
 - record the following:
   - whether or not the needle text was returned by the retrieval step of RAG
+  - which chunk from the retrieval step the needle was found in, if present
   - whether or not the needle text was returned by the LLM's final response
 - delete the contextual document from the vector store
 - delete the assistant

src/leapfrogai_evals/evals/niah_eval.py
Lines changed: 4 additions & 2 deletions

@@ -3,7 +3,7 @@

 from deepeval.test_case import LLMTestCase

-from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response
+from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response, NIAH_Chunk_Rank
 from leapfrogai_evals.runners import NIAH_Runner


@@ -26,6 +26,7 @@ def niah_eval(*args, **kwargs) -> dict:
                 additional_metadata={
                     "retrieval_score": row["retrieval_score"],
                     "response_score": row["response_score"],
+                    "chunk_rank": row["chunk_rank"],
                 },
             )
         )
@@ -34,7 +35,8 @@ def niah_eval(*args, **kwargs) -> dict:
     # TODO: Give ability to choose which metrics to run
     retrieval_metric = NIAH_Retrieval()
     response_metric = NIAH_Response()
-    metrics = [retrieval_metric, response_metric]
+    chunk_rank_metric = NIAH_Chunk_Rank()
+    metrics = [retrieval_metric, response_metric, chunk_rank_metric]

     # record scores and return results
     for metric in metrics:

src/leapfrogai_evals/evals/qa_eval.py
Lines changed: 10 additions & 2 deletions

@@ -2,7 +2,11 @@
 import numpy as np
 import os

-from deepeval.metrics import AnswerRelevancyMetric
+from deepeval.metrics import (
+    AnswerRelevancyMetric,
+    ContextualRelevancyMetric,
+    FaithfulnessMetric,
+)
 from deepeval.test_case import LLMTestCase

 from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric
@@ -27,11 +31,11 @@ def qa_eval(*args, **kwargs) -> dict:
                 actual_output=row["actual_output"],
                 context=row["context"],
                 expected_output=row["expected_output"],
+                retrieval_context=row["retrieval_context"],
                 additional_metadata={
                     "actual_annotations": row["actual_annotations"],
                     "expected_annotations": row["expected_annotations"],
                 },
-                # retrieval_context = row['retrieval_context'] # TODO: add this for more metrics
             )
         )

@@ -45,10 +49,14 @@ def qa_eval(*args, **kwargs) -> dict:
     # TODO: Give ability to choose which metrics to run
     correctness_metric = CorrectnessMetric(model=judge_model)
     answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
+    contextual_relevancy_metric = ContextualRelevancyMetric(model=judge_model)
+    faithfulness_metric = FaithfulnessMetric(model=judge_model)
     annotation_relevancy_metric = AnnotationRelevancyMetric()
     metrics = [
         correctness_metric,
         answer_relevancy_metric,
+        contextual_relevancy_metric,
+        faithfulness_metric,
         annotation_relevancy_metric,
     ]
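DeepEval's ContextualRelevancyMetric and FaithfulnessMetric judge the answer against the retrieved chunks, which is why the previously commented-out `retrieval_context` field is now populated on every test case. A minimal sketch of that pattern outside the eval harness, with made-up strings and `gpt-4o` standing in for whatever judge model is actually configured:

```python
# Sketch only (not from the commit): shows why retrieval_context must be set
# for the two new DeepEval metrics. Strings and the judge model are placeholders.
from deepeval.metrics import ContextualRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="Who maintains the upgrade runbook?",
    actual_output="The platform team maintains the upgrade runbook.",
    retrieval_context=[
        "Runbook v2, maintained by the platform team, covers upgrade steps.",
        "An unrelated chunk about cluster networking.",
    ],
)

for metric in [
    ContextualRelevancyMetric(model="gpt-4o"),
    FaithfulnessMetric(model="gpt-4o"),
]:
    metric.measure(test_case)  # each metric calls the judge model under the hood
    print(type(metric).__name__, metric.score, metric.reason)
```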

src/leapfrogai_evals/metrics/__init__.py
Lines changed: 5 additions & 1 deletion

@@ -3,4 +3,8 @@

 from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric
 from leapfrogai_evals.metrics.correctness import CorrectnessMetric
-from leapfrogai_evals.metrics.niah_metrics import NIAH_Response, NIAH_Retrieval
+from leapfrogai_evals.metrics.niah_metrics import (
+    NIAH_Response,
+    NIAH_Retrieval,
+    NIAH_Chunk_Rank,
+)

src/leapfrogai_evals/metrics/niah_metrics.py
Lines changed: 54 additions & 0 deletions

@@ -109,3 +109,57 @@ def is_successful(self) -> bool:
     @property
     def __name__(self):
         return "Needle in a Haystack (NIAH) Response"
+
+
+class NIAH_Chunk_Rank(BaseMetric):
+    """A metric for measuring the chunk rank score from the LFAI Needle in a Haystack Evaluation"""
+
+    def __init__(
+        self,
+        threshold: float = 1.0,
+        async_mode: bool = True,
+    ):
+        self.threshold = threshold
+        self.async_mode = async_mode
+
+    def measure(self, test_case: LLMTestCase) -> int:
+        """
+        Records the niah chunk_rank from the test case
+
+        This function checks for the presence of a chunk rank (provided by the niah_runner)
+        and sets a boolean determined by said score. The score is calculated in the runner to keep the
+        runner self-contained as a means of running the entire evaluation on its own. For simplicity,
+        the score is copied here for integration with DeepEval.
+
+        params:
+        -------
+        test_case: LLMTestCase
+            A test case object built from the results of a needle in a haystack evaluation run.
+            test_case should contain an additional metadata field that returns a dictionary with
+            the field "chunk_rank"
+
+        returns:
+        -------
+        int
+            A score that is equal to the "chunk_rank" from the test_case
+        """
+        self.score = test_case.additional_metadata["chunk_rank"]
+        self.success = self.score >= self.threshold
+
+        if self.success:
+            self.reason = f"Response in the NIAH evaluation scored greater than or equal to the threshold score of {self.threshold}"
+        else:
+            self.reason = f"Response in the NIAH evaluation scored less than the threshold score of {self.threshold}"
+
+        return self.score
+
+    async def a_measure(self, test_case: LLMTestCase) -> int:
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self.measure, test_case)
+
+    def is_successful(self) -> bool:
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Needle in a Haystack (NIAH) Chunk Rank"

src/leapfrogai_evals/models/lfai.py
Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ def __init__(
     ):
         self.model = model or os.getenv("MODEL_TO_EVALUATE")
         self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY")
-        self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL")
+        self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL") + "/openai/v1"
         self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)

     def load_model(self):
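This change pairs with the `.env` update above: `LEAPFROGAI_API_URL` now holds the bare API root, the OpenAI-compatible path is appended wherever an `openai.OpenAI` client is built, and the NIAH runner can reuse the same root for LeapfrogAI-specific routes. A small sketch of the resulting URLs (the chunk id is a placeholder):

```python
import os

# Assume the development value from .env.example.
os.environ["LEAPFROGAI_API_URL"] = "https://leapfrogai-api.uds.dev"

# Base URL for the OpenAI-compatible endpoints used by the openai.OpenAI clients:
openai_base = os.getenv("LEAPFROGAI_API_URL") + "/openai/v1"
print(openai_base)  # https://leapfrogai-api.uds.dev/openai/v1

# LeapfrogAI-specific route the NIAH runner uses to fetch chunk content;
# "example-chunk-id" is a placeholder for a real vector id.
vector_url = (
    os.getenv("LEAPFROGAI_API_URL")
    + "/leapfrogai/v1/vector_stores/vector/"
    + "example-chunk-id"
)
print(vector_url)
```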

src/leapfrogai_evals/runners/niah_runner.py
Lines changed: 71 additions & 21 deletions

@@ -1,7 +1,9 @@
+import ast
 import logging
 import numpy as np
 import os
 import openai
+import requests

 from datasets import load_dataset, concatenate_datasets
 from distutils.util import strtobool
@@ -78,7 +80,7 @@ def __init__(
         )

         self.client = openai.OpenAI(
-            base_url=base_url or os.environ.get("LEAPFROGAI_API_URL"),
+            base_url=base_url or os.environ.get("LEAPFROGAI_API_URL") + "/openai/v1",
             api_key=api_key or os.environ.get("LEAPFROGAI_API_KEY"),
         )
         logging.info(f"client url: {self.client.base_url}")
@@ -91,8 +93,6 @@ def __init__(
             num_copies=int(os.environ.get("NIAH_NUM_COPIES", num_copies)),
         )
        self._create_vector_store()
-        self.retrieval_score = None
-        self.response_score = None

     def run_experiment(self, cleanup: bool = True) -> None:
         """
@@ -110,6 +110,7 @@ def run_experiment(self, cleanup: bool = True) -> None:
         try:
             retrieval_scores = []
             response_scores = []
+            chunk_ranks = []
             response_contents = []

             for row in tqdm(self.niah_data, desc="Evaluating data rows"):
@@ -162,32 +163,51 @@ def run_experiment(self, cleanup: bool = True) -> None:

                 retrieval_score = 0.0
                 response_score = 0.0
+                chunk_rank = 0.0
                 response_content = ""

                 for response in response_messages:
                     response_content += response.content[0].text.value + "\n"
+                    secret_code = row["secret_code"]
+                    chunk_ids = ast.literal_eval(response.metadata["vector_ids"])

                     # retrieval_score
-                    # 1 if needle text was returned by the retrieval step of RAG else 0
-                    logging.debug(
-                        f"number of annotations in response: {len(response.content[0].text.annotations)}"
-                    )
-                    for annotation in response.content[0].text.annotations:
-                        annotation_id = annotation.file_citation.file_id
-                        if annotation_id == self.current_file:
-                            logging.debug("Setting retrieval_score to 1.0")
+                    # 1 if needle text is found in any chunk in the context, else 0
+                    # chunk_rank
+                    # see _calculate_chunk_rank for explanation
+                    for chunk_num, chunk_id in enumerate(chunk_ids):
+                        logging.info(f"chunk {chunk_num} (id: {chunk_id})")
+                        vector_response = requests.get(
+                            url=os.getenv("LEAPFROGAI_API_URL")
+                            + "/leapfrogai/v1/vector_stores/vector/"
+                            + chunk_id,
+                            headers={
+                                "accept": "application/json",
+                                "Authorization": "Bearer "
+                                + os.getenv("LEAPFROGAI_API_KEY"),
+                            },
+                        ).json()
+                        logging.info(f"chunk_data: {vector_response['content']}")
+
+                        if secret_code in vector_response["content"]:
+                            logging.info(
+                                f"secret code {secret_code} found in chunk {chunk_num} with id {vector_response['id']}"
+                            )
+                            chunk_rank = self._calculate_chunk_rank(
+                                chunk_place=chunk_num, total_chunks=len(chunk_ids)
+                            )
                             retrieval_score = 1.0

-                    # # response_score
-                    # # 1 if needle text was returned by the LLM's final response else 0
-                    secret_code = row["secret_code"]
+                    # response_score
+                    # 1 if needle text was returned by the LLM's final response else 0
                     logging.info(f"Response message: {response.content[0].text.value}")
                     if secret_code in response.content[0].text.value:
                         logging.debug("Setting response_score to 1.0")
                         response_score = 1.0

                 retrieval_scores.append(retrieval_score)
                 response_scores.append(response_score)
+                chunk_ranks.append(chunk_rank)
                 response_contents.append(response_content)

                 # delete file to clean up the vector store
@@ -210,15 +230,16 @@ def run_experiment(self, cleanup: bool = True) -> None:
             self.niah_data = self.niah_data.add_column(
                 name="response_score", column=response_scores
             )
+            self.niah_data = self.niah_data.add_column(
+                name="chunk_rank", column=chunk_ranks
+            )
             self.niah_data = self.niah_data.add_column(
                 name="response", column=response_contents
             )

-            self.retrieval_score = np.mean(retrieval_scores)
-            self.response_score = np.mean(response_scores)
-
-            logging.info(f"Retrieval Score {self.retrieval_score}")
-            logging.info(f"Response Score {self.response_score}")
+            logging.info(f"Retrieval Score: {np.mean(retrieval_scores)}")
+            logging.info(f"Response Score: {np.mean(response_scores)}")
+            logging.info(f"Chunk Rank Score: {np.mean(chunk_ranks)}")

         # remove artifacts from the API if the experiment fails
         except Exception as exc:
@@ -264,7 +285,8 @@ def _load_niah_dataset(
         """
         logging.info(f"Downloading dataset: {dataset_name} from HuggingFace")
         niah_dataset = load_dataset(dataset_name)
-        self.padding = niah_dataset["padding"]
+        if self.add_padding:
+            self.padding = niah_dataset["padding"]
         niah_dataset = concatenate_datasets(
             [
                 niah_dataset["base_eval"],
@@ -339,8 +361,11 @@ def _create_vector_store(self) -> VectorStore:
             logging.debug(
                 f"Added {len(self.padding)} files as padding to the haystack vector store"
             )
+            self.padding = self.padding.add_column(
+                name="padding_id", column=padding_ids
+            )
+
         self.vector_store = vector_store
-        self.padding = self.padding.add_column(name="padding_id", column=padding_ids)

     def _delete_vector_store(self, vector_store_id: str) -> None:
         """Deletes the vector store used for all NIAH evaluations"""
@@ -360,3 +385,28 @@ def _delete_file(self, file_id: str) -> None:
             file_id=file_id, vector_store_id=self.vector_store.id
         )
         self.client.files.delete(file_id=file_id)
+
+    def _calculate_chunk_rank(self, chunk_place: int, total_chunks: int) -> float:
+        """
+        Calculate an individual chunk's rank
+
+        When a needle is found in a certain chunk, we calculate the rank of that chunk.
+        This rank is based on what place in the responses it came (between 0 and total_chunks-1)
+        using this formula:
+
+        chunk_rank_score = (total_chunks - chunk_place) / total_chunks
+
+        e.g.
+        total_chunks = 5
+        chunk_place = 0 (first in the list)
+        chunk_rank_score = (5 - 0) / 5 = 1.0
+
+        e.g.
+        total_chunks = 5
+        chunk_place = 4 (last in 0 indexed list)
+        chunk_rank_score = (5 - 4) / 5 = 0.2
+
+        not finding the needle results in a score of 0 (set outside this function)
+        """
+        chunk_rank_score = float(total_chunks - chunk_place) / float(total_chunks)
+        return chunk_rank_score
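To make the rank scale concrete, here is a standalone sketch (not part of the commit) of the `_calculate_chunk_rank` formula evaluated at every position in a five-chunk retrieval, matching the examples in the docstring:

```python
# Standalone sketch of the chunk rank formula from _calculate_chunk_rank,
# evaluated at every position in a five-chunk retrieval. Not part of the commit;
# it just illustrates the scale behind the "Chunk Rank Score" log line.
def chunk_rank(chunk_place: int, total_chunks: int) -> float:
    return float(total_chunks - chunk_place) / float(total_chunks)

total_chunks = 5
for place in range(total_chunks):
    print(place, chunk_rank(place, total_chunks))
# 0 1.0  -> needle in the first retrieved chunk
# 1 0.8
# 2 0.6
# 3 0.4
# 4 0.2  -> needle in the last retrieved chunk
# A needle that never appears in any retrieved chunk keeps the default score of 0.0.
```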
