+ import ast
import logging
import numpy as np
import os
import openai
+ import requests

from datasets import load_dataset, concatenate_datasets
from distutils.util import strtobool
@@ -78,7 +80,7 @@ def __init__(
        )

        self.client = openai.OpenAI(
-            base_url=base_url or os.environ.get("LEAPFROGAI_API_URL"),
+            base_url=base_url or os.environ.get("LEAPFROGAI_API_URL") + "/openai/v1",
            api_key=api_key or os.environ.get("LEAPFROGAI_API_KEY"),
        )
        logging.info(f"client url: {self.client.base_url}")
@@ -91,8 +93,6 @@ def __init__(
            num_copies=int(os.environ.get("NIAH_NUM_COPIES", num_copies)),
        )
        self._create_vector_store()
-        self.retrieval_score = None
-        self.response_score = None

    def run_experiment(self, cleanup: bool = True) -> None:
        """
@@ -110,6 +110,7 @@ def run_experiment(self, cleanup: bool = True) -> None:
        try:
            retrieval_scores = []
            response_scores = []
+            chunk_ranks = []
            response_contents = []

            for row in tqdm(self.niah_data, desc="Evaluating data rows"):
@@ -162,32 +163,51 @@ def run_experiment(self, cleanup: bool = True) -> None:

                retrieval_score = 0.0
                response_score = 0.0
+                chunk_rank = 0.0
                response_content = ""

                for response in response_messages:
                    response_content += response.content[0].text.value + "\n"
+                    secret_code = row["secret_code"]
+                    chunk_ids = ast.literal_eval(response.metadata["vector_ids"])

                    # retrieval_score
-                    # 1 if needle text was returned by the retrieval step of RAG else 0
-                    logging.debug(
-                        f"number of annotations in response: {len(response.content[0].text.annotations)}"
-                    )
-                    for annotation in response.content[0].text.annotations:
-                        annotation_id = annotation.file_citation.file_id
-                        if annotation_id == self.current_file:
-                            logging.debug("Setting retrieval_score to 1.0")
+                    # 1 if needle text is found in any chunk in the context, else 0
+                    # chunk_rank
+                    # see _calculate_chunk_rank for explanation
+                    for chunk_num, chunk_id in enumerate(chunk_ids):
+                        logging.info(f"chunk {chunk_num} (id: {chunk_id})")
+                        vector_response = requests.get(
+                            url=os.getenv("LEAPFROGAI_API_URL")
+                            + "/leapfrogai/v1/vector_stores/vector/"
+                            + chunk_id,
+                            headers={
+                                "accept": "application/json",
+                                "Authorization": "Bearer "
+                                + os.getenv("LEAPFROGAI_API_KEY"),
+                            },
+                        ).json()
+                        logging.info(f"chunk_data: {vector_response['content']}")
+
+                        if secret_code in vector_response["content"]:
+                            logging.info(
+                                f"secret code {secret_code} found in chunk {chunk_num} with id {vector_response['id']}"
+                            )
+                            chunk_rank = self._calculate_chunk_rank(
+                                chunk_place=chunk_num, total_chunks=len(chunk_ids)
+                            )
                            retrieval_score = 1.0

-                    # # response_score
-                    # # 1 if needle text was returned by the LLM's final response else 0
-                    secret_code = row["secret_code"]
+                    # response_score
+                    # 1 if needle text was returned by the LLM's final response else 0
                    logging.info(f"Response message: {response.content[0].text.value}")
                    if secret_code in response.content[0].text.value:
                        logging.debug("Setting response_score to 1.0")
                        response_score = 1.0

                retrieval_scores.append(retrieval_score)
                response_scores.append(response_score)
+                chunk_ranks.append(chunk_rank)
                response_contents.append(response_content)

                # delete file to clean up the vector store
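Stripped of the API calls, the scoring above reduces to a scan over the retrieved chunk texts. A minimal sketch, assuming the chunk contents have already been fetched as plain strings (the helper and sample data are hypothetical):

def score_retrieval(chunk_texts: list[str], secret_code: str) -> tuple[float, float]:
    """Return (retrieval_score, chunk_rank) for one response, per the loop above."""
    for chunk_num, text in enumerate(chunk_texts):
        if secret_code in text:
            # Needle found: retrieval succeeds, and the rank rewards earlier chunks.
            return 1.0, (len(chunk_texts) - chunk_num) / len(chunk_texts)
    return 0.0, 0.0  # needle never retrieved

# Hypothetical example: the needle sits in the second of four retrieved chunks.
print(score_retrieval(["padding", "code: XYZZY", "padding", "padding"], "XYZZY"))
# (1.0, 0.75)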
@@ -210,15 +230,16 @@ def run_experiment(self, cleanup: bool = True) -> None:
            self.niah_data = self.niah_data.add_column(
                name="response_score", column=response_scores
            )
+            self.niah_data = self.niah_data.add_column(
+                name="chunk_rank", column=chunk_ranks
+            )
            self.niah_data = self.niah_data.add_column(
                name="response", column=response_contents
            )

-            self.retrieval_score = np.mean(retrieval_scores)
-            self.response_score = np.mean(response_scores)
-
-            logging.info(f"Retrieval Score {self.retrieval_score}")
-            logging.info(f"Response Score {self.response_score}")
+            logging.info(f"Retrieval Score: {np.mean(retrieval_scores)}")
+            logging.info(f"Response Score: {np.mean(response_scores)}")
+            logging.info(f"Chunk Rank Score: {np.mean(chunk_ranks)}")

        # remove artifacts from the API if the experiment fails
        except Exception as exc:
@@ -264,7 +285,8 @@ def _load_niah_dataset(
        """
        logging.info(f"Downloading dataset: {dataset_name} from HuggingFace")
        niah_dataset = load_dataset(dataset_name)
-        self.padding = niah_dataset["padding"]
+        if self.add_padding:
+            self.padding = niah_dataset["padding"]
        niah_dataset = concatenate_datasets(
            [
                niah_dataset["base_eval"],
@@ -339,8 +361,11 @@ def _create_vector_store(self) -> VectorStore:
            logging.debug(
                f"Added {len(self.padding)} files as padding to the haystack vector store"
            )
+            self.padding = self.padding.add_column(
+                name="padding_id", column=padding_ids
+            )
+
        self.vector_store = vector_store
-        self.padding = self.padding.add_column(name="padding_id", column=padding_ids)

    def _delete_vector_store(self, vector_store_id: str) -> None:
        """Deletes the vector store used for all NIAH evaluations"""
@@ -360,3 +385,28 @@ def _delete_file(self, file_id: str) -> None:
            file_id=file_id, vector_store_id=self.vector_store.id
        )
        self.client.files.delete(file_id=file_id)
+
+    def _calculate_chunk_rank(self, chunk_place: int, total_chunks: int) -> float:
+        """
+        Calculate an individual chunk's rank.
+
+        When a needle is found in a certain chunk, we calculate the rank of that chunk.
+        The rank is based on the chunk's position in the retrieved results
+        (between 0 and total_chunks - 1), using this formula:
+
+        chunk_rank_score = (total_chunks - chunk_place) / total_chunks
+
+        e.g.
+        total_chunks = 5
+        chunk_place = 0 (first in the list)
+        chunk_rank_score = (5 - 0) / 5 = 1.0
+
+        e.g.
+        total_chunks = 5
+        chunk_place = 4 (last in the 0-indexed list)
+        chunk_rank_score = (5 - 4) / 5 = 0.2
+
+        Not finding the needle results in a score of 0 (set outside this function).
+        """
+        chunk_rank_score = float(total_chunks - chunk_place) / float(total_chunks)
+        return chunk_rank_score
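The docstring's worked examples double as a quick sanity check; a standalone sketch of the same formula (the free function is hypothetical, mirroring the method above):

def calculate_chunk_rank(chunk_place: int, total_chunks: int) -> float:
    # Same formula as _calculate_chunk_rank above.
    return float(total_chunks - chunk_place) / float(total_chunks)

assert calculate_chunk_rank(0, 5) == 1.0  # first chunk, best rank
assert calculate_chunk_rank(4, 5) == 0.2  # last of five chunks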