This repository was archived by the owner on Feb 15, 2025. It is now read-only.

Commit ad697cd

feat: use chunk data in NIAH and QA evals (#1176)

* incorporate chunk data into NIAH retrieval metric
* add chunk_rank metric
* add DeepEval metrics
* fix NIAH padding bug

Parent commit: b9f6413

File tree: 9 files changed (+177 lines, -32 lines)

src/leapfrogai_evals/.env.example
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev/openai/v1"
+LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev"
 LEAPFROGAI_API_KEY="lfai-api-key"
 ANTHROPIC_API_KEY="anthropic-api-key"

src/leapfrogai_evals/README.md
Lines changed: 2 additions & 1 deletion

@@ -18,7 +18,7 @@ cp .env.example .env
 Within `.env`, replace the necessary environment variables:

 ```bash
-LEAPFROGAI_API_URL=<LeapfrogAI API url, usually: https://leapfrogai-api.uds.dev/openai/v1 for development>
+LEAPFROGAI_API_URL=<LeapfrogAI API url, usually: https://leapfrogai-api.uds.dev for development>
 LEAPFROGAI_API_KEY=<LeapfrogAI API key>
 ANTHROPIC_API_KEY=<Anthropic API key>
 ```
@@ -108,6 +108,7 @@ The LeapfrogAI NIAH evaluation uses the following process:
 - prompt the LLM to provide the secret code hidden in the context
 - record the following:
   - whether or not the needle text was returned by the retrieval step of RAG
+  - which chunk from the retrieval step the needle was found in, if present
   - whether or not the needle text was returned by the LLM's final response
 - delete the contextual document from the vector store
 - delete the assistant

src/leapfrogai_evals/evals/niah_eval.py
Lines changed: 4 additions & 2 deletions

@@ -3,7 +3,7 @@

 from deepeval.test_case import LLMTestCase

-from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response
+from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response, NIAH_Chunk_Rank
 from leapfrogai_evals.runners import NIAH_Runner


@@ -26,6 +26,7 @@ def niah_eval(*args, **kwargs) -> dict:
                 additional_metadata={
                     "retrieval_score": row["retrieval_score"],
                     "response_score": row["response_score"],
+                    "chunk_rank": row["chunk_rank"],
                 },
             )
         )
@@ -34,7 +35,8 @@ def niah_eval(*args, **kwargs) -> dict:
     # TODO: Give ability to choose which metrics to run
     retrieval_metric = NIAH_Retrieval()
     response_metric = NIAH_Response()
-    metrics = [retrieval_metric, response_metric]
+    chunk_rank_metric = NIAH_Chunk_Rank()
+    metrics = [retrieval_metric, response_metric, chunk_rank_metric]

     # record scores and return results
     for metric in metrics:

src/leapfrogai_evals/evals/qa_eval.py
Lines changed: 10 additions & 2 deletions

@@ -2,7 +2,11 @@
 import numpy as np
 import os

-from deepeval.metrics import AnswerRelevancyMetric
+from deepeval.metrics import (
+    AnswerRelevancyMetric,
+    ContextualRelevancyMetric,
+    FaithfulnessMetric,
+)
 from deepeval.test_case import LLMTestCase

 from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric
@@ -27,11 +31,11 @@ def qa_eval(*args, **kwargs) -> dict:
                 actual_output=row["actual_output"],
                 context=row["context"],
                 expected_output=row["expected_output"],
+                retrieval_context=row["retrieval_context"],
                 additional_metadata={
                     "actual_annotations": row["actual_annotations"],
                     "expected_annotations": row["expected_annotations"],
                 },
-                # retrieval_context = row['retrieval_context'] # TODO: add this for more metrics
             )
         )

@@ -45,10 +49,14 @@ def qa_eval(*args, **kwargs) -> dict:
     # TODO: Give ability to choose which metrics to run
     correctness_metric = CorrectnessMetric(model=judge_model)
     answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
+    contextual_relevancy_metric = ContextualRelevancyMetric(model=judge_model)
+    faithfulness_metric = FaithfulnessMetric(model=judge_model)
     annotation_relevancy_metric = AnnotationRelevancyMetric()
     metrics = [
         correctness_metric,
         answer_relevancy_metric,
+        contextual_relevancy_metric,
+        faithfulness_metric,
         annotation_relevancy_metric,
     ]
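DeepEval's ContextualRelevancyMetric and FaithfulnessMetric judge the answer against the retrieved chunks, which is why the previously commented-out `retrieval_context` field is now populated on every test case. A minimal sketch of that pattern outside the eval harness, with made-up strings and `gpt-4o` standing in for whatever judge model is actually configured:

```python
# Sketch only (not from the commit): shows why retrieval_context must be set
# for the two new DeepEval metrics. Strings and the judge model are placeholders.
from deepeval.metrics import ContextualRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="Who maintains the upgrade runbook?",
    actual_output="The platform team maintains the upgrade runbook.",
    retrieval_context=[
        "Runbook v2, maintained by the platform team, covers upgrade steps.",
        "An unrelated chunk about cluster networking.",
    ],
)

for metric in [
    ContextualRelevancyMetric(model="gpt-4o"),
    FaithfulnessMetric(model="gpt-4o"),
]:
    metric.measure(test_case)  # each metric calls the judge model under the hood
    print(type(metric).__name__, metric.score, metric.reason)
```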

src/leapfrogai_evals/metrics/__init__.py
Lines changed: 5 additions & 1 deletion

@@ -3,4 +3,8 @@

 from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric
 from leapfrogai_evals.metrics.correctness import CorrectnessMetric
-from leapfrogai_evals.metrics.niah_metrics import NIAH_Response, NIAH_Retrieval
+from leapfrogai_evals.metrics.niah_metrics import (
+    NIAH_Response,
+    NIAH_Retrieval,
+    NIAH_Chunk_Rank,
+)

src/leapfrogai_evals/metrics/niah_metrics.py
Lines changed: 54 additions & 0 deletions

@@ -109,3 +109,57 @@ def is_successful(self) -> bool:
     @property
     def __name__(self):
         return "Needle in a Haystack (NIAH) Response"
+
+
+class NIAH_Chunk_Rank(BaseMetric):
+    """A metric for measuring the chunk rank score from the LFAI Needle in a Haystack Evaluation"""
+
+    def __init__(
+        self,
+        threshold: float = 1.0,
+        async_mode: bool = True,
+    ):
+        self.threshold = threshold
+        self.async_mode = async_mode
+
+    def measure(self, test_case: LLMTestCase) -> int:
+        """
+        Records the niah chunk_rank from the test case
+
+        This function checks for the presence of a chunk rank (provided by the niah_runner)
+        and sets a boolean determined by said score. The score is calculated in the runner to keep the
+        runner self-contained as a means of running the entire evaluation on its own. For simplicity,
+        the score is copied here for integration with DeepEval.
+
+        params:
+        -------
+        test_case: LLMTestCase
+            A test case object built from the results of a needle in a haystack evaluation run.
+            test_case should contain an additional metadata field that returns a dictionary with
+            the field "chunk_rank"
+
+        returns:
+        -------
+        int
+            A score that is equal to the "chunk_rank" from the test_case
+        """
+        self.score = test_case.additional_metadata["chunk_rank"]
+        self.success = self.score >= self.threshold
+
+        if self.success:
+            self.reason = f"Response in the NIAH evaluation scored greater than or equal to the threshold score of {self.threshold}"
+        else:
+            self.reason = f"Response in the NIAH evaluation scored less than the threshold score of {self.threshold}"
+
+        return self.score
+
+    async def a_measure(self, test_case: LLMTestCase) -> int:
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self.measure, test_case)
+
+    def is_successful(self) -> bool:
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Needle in a Haystack (NIAH) Chunk Rank"

src/leapfrogai_evals/models/lfai.py
Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ def __init__(
     ):
         self.model = model or os.getenv("MODEL_TO_EVALUATE")
         self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY")
-        self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL")
+        self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL") + "/openai/v1"
         self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)

     def load_model(self):
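This change pairs with the `.env` update above: `LEAPFROGAI_API_URL` now holds the bare API root, the OpenAI-compatible path is appended wherever an `openai.OpenAI` client is built, and the NIAH runner can reuse the same root for LeapfrogAI-specific routes. A small sketch of the resulting URLs (the chunk id is a placeholder):

```python
import os

# Assume the development value from .env.example.
os.environ["LEAPFROGAI_API_URL"] = "https://leapfrogai-api.uds.dev"

# Base URL for the OpenAI-compatible endpoints used by the openai.OpenAI clients:
openai_base = os.getenv("LEAPFROGAI_API_URL") + "/openai/v1"
print(openai_base)  # https://leapfrogai-api.uds.dev/openai/v1

# LeapfrogAI-specific route the NIAH runner uses to fetch chunk content;
# "example-chunk-id" is a placeholder for a real vector id.
vector_url = (
    os.getenv("LEAPFROGAI_API_URL")
    + "/leapfrogai/v1/vector_stores/vector/"
    + "example-chunk-id"
)
print(vector_url)
```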

src/leapfrogai_evals/runners/niah_runner.py
Lines changed: 71 additions & 21 deletions

@@ -1,7 +1,9 @@
+import ast
 import logging
 import numpy as np
 import os
 import openai
+import requests

 from datasets import load_dataset, concatenate_datasets
 from distutils.util import strtobool
@@ -78,7 +80,7 @@ def __init__(
         )

         self.client = openai.OpenAI(
-            base_url=base_url or os.environ.get("LEAPFROGAI_API_URL"),
+            base_url=base_url or os.environ.get("LEAPFROGAI_API_URL") + "/openai/v1",
             api_key=api_key or os.environ.get("LEAPFROGAI_API_KEY"),
         )
         logging.info(f"client url: {self.client.base_url}")
@@ -91,8 +93,6 @@ def __init__(
             num_copies=int(os.environ.get("NIAH_NUM_COPIES", num_copies)),
         )
        self._create_vector_store()
-        self.retrieval_score = None
-        self.response_score = None

     def run_experiment(self, cleanup: bool = True) -> None:
         """
@@ -110,6 +110,7 @@ def run_experiment(self, cleanup: bool = True) -> None:
         try:
             retrieval_scores = []
             response_scores = []
+            chunk_ranks = []
             response_contents = []

             for row in tqdm(self.niah_data, desc="Evaluating data rows"):
@@ -162,32 +163,51 @@ def run_experiment(self, cleanup: bool = True) -> None:

                 retrieval_score = 0.0
                 response_score = 0.0
+                chunk_rank = 0.0
                 response_content = ""

                 for response in response_messages:
                     response_content += response.content[0].text.value + "\n"
+                    secret_code = row["secret_code"]
+                    chunk_ids = ast.literal_eval(response.metadata["vector_ids"])

                     # retrieval_score
-                    # 1 if needle text was returned by the retrieval step of RAG else 0
-                    logging.debug(
-                        f"number of annotations in response: {len(response.content[0].text.annotations)}"
-                    )
-                    for annotation in response.content[0].text.annotations:
-                        annotation_id = annotation.file_citation.file_id
-                        if annotation_id == self.current_file:
-                            logging.debug("Setting retrieval_score to 1.0")
+                    # 1 if needle text is found in any chunk in the context, else 0
+                    # chunk_rank
+                    # see _calculate_chunk_rank for explanation
+                    for chunk_num, chunk_id in enumerate(chunk_ids):
+                        logging.info(f"chunk {chunk_num} (id: {chunk_id})")
+                        vector_response = requests.get(
+                            url=os.getenv("LEAPFROGAI_API_URL")
+                            + "/leapfrogai/v1/vector_stores/vector/"
+                            + chunk_id,
+                            headers={
+                                "accept": "application/json",
+                                "Authorization": "Bearer "
+                                + os.getenv("LEAPFROGAI_API_KEY"),
+                            },
+                        ).json()
+                        logging.info(f"chunk_data: {vector_response['content']}")
+
+                        if secret_code in vector_response["content"]:
+                            logging.info(
+                                f"secret code {secret_code} found in chunk {chunk_num} with id {vector_response['id']}"
+                            )
+                            chunk_rank = self._calculate_chunk_rank(
+                                chunk_place=chunk_num, total_chunks=len(chunk_ids)
+                            )
                             retrieval_score = 1.0

-                    # # response_score
-                    # # 1 if needle text was returned by the LLM's final response else 0
-                    secret_code = row["secret_code"]
+                    # response_score
+                    # 1 if needle text was returned by the LLM's final response else 0
                     logging.info(f"Response message: {response.content[0].text.value}")
                     if secret_code in response.content[0].text.value:
                         logging.debug("Setting response_score to 1.0")
                         response_score = 1.0

                 retrieval_scores.append(retrieval_score)
                 response_scores.append(response_score)
+                chunk_ranks.append(chunk_rank)
                 response_contents.append(response_content)

                 # delete file to clean up the vector store
@@ -210,15 +230,16 @@ def run_experiment(self, cleanup: bool = True) -> None:
             self.niah_data = self.niah_data.add_column(
                 name="response_score", column=response_scores
             )
+            self.niah_data = self.niah_data.add_column(
+                name="chunk_rank", column=chunk_ranks
+            )
             self.niah_data = self.niah_data.add_column(
                 name="response", column=response_contents
             )

-            self.retrieval_score = np.mean(retrieval_scores)
-            self.response_score = np.mean(response_scores)
-
-            logging.info(f"Retrieval Score {self.retrieval_score}")
-            logging.info(f"Response Score {self.response_score}")
+            logging.info(f"Retrieval Score: {np.mean(retrieval_scores)}")
+            logging.info(f"Response Score: {np.mean(response_scores)}")
+            logging.info(f"Chunk Rank Score: {np.mean(chunk_ranks)}")

         # remove artifacts from the API if the experiment fails
         except Exception as exc:
@@ -264,7 +285,8 @@ def _load_niah_dataset(
         """
         logging.info(f"Downloading dataset: {dataset_name} from HuggingFace")
         niah_dataset = load_dataset(dataset_name)
-        self.padding = niah_dataset["padding"]
+        if self.add_padding:
+            self.padding = niah_dataset["padding"]
         niah_dataset = concatenate_datasets(
             [
                 niah_dataset["base_eval"],
@@ -339,8 +361,11 @@ def _create_vector_store(self) -> VectorStore:
             logging.debug(
                 f"Added {len(self.padding)} files as padding to the haystack vector store"
             )
+            self.padding = self.padding.add_column(
+                name="padding_id", column=padding_ids
+            )
+
         self.vector_store = vector_store
-        self.padding = self.padding.add_column(name="padding_id", column=padding_ids)

     def _delete_vector_store(self, vector_store_id: str) -> None:
         """Deletes the vector store used for all NIAH evaluations"""
@@ -360,3 +385,28 @@ def _delete_file(self, file_id: str) -> None:
             file_id=file_id, vector_store_id=self.vector_store.id
         )
         self.client.files.delete(file_id=file_id)
+
+    def _calculate_chunk_rank(self, chunk_place: int, total_chunks: int) -> float:
+        """
+        Calculate an individual chunk's rank
+
+        When a needle is found in a certain chunk, we calculate the rank of that chunk.
+        This rank is based on what place in the responses it came (between 0 and total_chunks-1)
+        using this formula:
+
+        chunk_rank_score = (total_chunks - chunk_place) / total_chunks
+
+        e.g.
+        total_chunks = 5
+        chunk_place = 0 (first in the list)
+        chunk_rank_score = (5 - 0) / 5 = 1.0
+
+        e.g.
+        total_chunks = 5
+        chunk_place = 4 (last in 0 indexed list)
+        chunk_rank_score = (5 - 4) / 5 = 0.2
+
+        not finding the needle results in a score of 0 (set outside this function)
+        """
+        chunk_rank_score = float(total_chunks - chunk_place) / float(total_chunks)
+        return chunk_rank_score
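To make the rank scale concrete, here is a standalone sketch (not part of the commit) of the `_calculate_chunk_rank` formula evaluated at every position in a five-chunk retrieval, matching the examples in the docstring:

```python
# Standalone sketch of the chunk rank formula from _calculate_chunk_rank,
# evaluated at every position in a five-chunk retrieval. Not part of the commit;
# it just illustrates the scale behind the "Chunk Rank Score" log line.
def chunk_rank(chunk_place: int, total_chunks: int) -> float:
    return float(total_chunks - chunk_place) / float(total_chunks)

total_chunks = 5
for place in range(total_chunks):
    print(place, chunk_rank(place, total_chunks))
# 0 1.0  -> needle in the first retrieved chunk
# 1 0.8
# 2 0.6
# 3 0.4
# 4 0.2  -> needle in the last retrieved chunk
# A needle that never appears in any retrieved chunk keeps the default score of 0.0.
```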
