Commit 94f1097

updated evaluations API
1 parent 9cd9e26 commit 94f1097

File tree

5 files changed: +209 -179 lines changed

evaluation/experiments.py

Lines changed: 169 additions & 0 deletions

@@ -0,0 +1,169 @@
+import os
+
+from training_utils import *
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm.auto import tqdm
+
+
+def exp_0(process_query_elastic_search, eval_path, query_key_dict):
+    """
+    EXPERIMENT 0
+    Get results from Elasticsearch.
+    """
+    # cols = ,did,name,query,llm_model,embedding_model,llm_before_rag
+    # for every query, get the results from Elasticsearch
+    if not os.path.exists(eval_path / "elasticsearch" / "elasticsearch"):
+        os.makedirs(eval_path / "elasticsearch" / "elasticsearch")
+    output_file_path = eval_path / "elasticsearch" / "elasticsearch" / "results.csv"
+
+    # skip the experiment if the results file already exists
+    if not os.path.exists(output_file_path):
+        with open(output_file_path, "w") as f:
+            f.write("did,name,query,llm_model,embedding_model,llm_before_rag\n")
+
+            # use a ThreadPoolExecutor to parallelize the Elasticsearch requests
+            with ThreadPoolExecutor(max_workers=10) as executor:
+                # start a future for each query
+                futures = {
+                    executor.submit(process_query_elastic_search, query, dataset_id): query
+                    for query, dataset_id in query_key_dict.items()
+                }
+                for future in tqdm(as_completed(futures), total=len(futures)):
+                    result = future.result()
+                    # append one CSV row per (id, query) pair
+                    for id, query in result:
+                        f.write(f"{id},None,{query},es,es,None\n")
+
+
+def exp_1(eval_path, config, list_of_embedding_models, list_of_llm_models, subset_ids, query_key_dict):
+    """
+    EXPERIMENT 1
+    Main evaluation loop that runs the base experiments with different LLMs and embedding models.
+    Takes into account the following:
+        original data ingestion pipeline : concatenate all metadata fields and the dataset description into a single string and embed it with no pre-processing
+        list_of_embedding_models = [
+            "BAAI/bge-large-en-v1.5",
+            "BAAI/bge-base-en-v1.5",
+            "Snowflake/snowflake-arctic-embed-l",
+        ]
+        list_of_llm_models = ["llama3", "phi3"]
+        types_of_llm_apply : LLM applied as a filter before the RAG pipeline, as a reranker after the RAG pipeline, or not used at all
+    """
+    expRunner = ExperimentRunner(
+        config=config,
+        eval_path=eval_path,
+        queries=query_key_dict.keys(),
+        list_of_embedding_models=list_of_embedding_models,
+        list_of_llm_models=list_of_llm_models,
+        subset_ids=subset_ids,
+        use_cached_experiment=True,
+    )
+    expRunner.run_experiments()
+
+
+def exp_2(eval_path, config, subset_ids, query_key_dict):
+    """
+    EXPERIMENT 2
+    Evaluating temperature = 1 (the default is 0.95).
+    Takes into account the following:
+        original data ingestion pipeline : concatenate all metadata fields and the dataset description into a single string and embed it with no pre-processing
+        list_of_embedding_models = [
+            "BAAI/bge-large-en-v1.5",
+        ]
+        list_of_llm_models = ["llama3"]
+        types_of_llm_apply : LLM applied as a filter before the RAG pipeline, as a reranker after the RAG pipeline, or not used at all
+    """
+    list_of_embedding_models = [
+        "BAAI/bge-large-en-v1.5",
+    ]
+    list_of_llm_models = ["llama3"]
+    config["temperature"] = 1
+
+    expRunner = ExperimentRunner(
+        config=config,
+        eval_path=eval_path,
+        queries=query_key_dict.keys(),
+        list_of_embedding_models=list_of_embedding_models,
+        list_of_llm_models=list_of_llm_models,
+        subset_ids=subset_ids,
+        use_cached_experiment=True,
+        custom_name="temperature_1",
+    )
+    expRunner.run_experiments()
+
+    # reset the temperature to its default value
+    config["temperature"] = 0.95
+
+
+def exp_3(eval_path, config, subset_ids, query_key_dict):
+    """
+    EXPERIMENT 3
+    Evaluating the search types mmr and similarity_score_threshold (the default is similarity).
+    Takes into account the following:
+        original data ingestion pipeline : concatenate all metadata fields and the dataset description into a single string and embed it with no pre-processing
+        list_of_embedding_models = [
+            "BAAI/bge-large-en-v1.5",
+        ]
+        list_of_llm_models = ["llama3"]
+        types_of_llm_apply : LLM applied as a reranker after the RAG pipeline
+    """
+    list_of_embedding_models = [
+        "BAAI/bge-large-en-v1.5",
+    ]
+    list_of_llm_models = ["llama3"]
+    types_of_llm_apply = [False]
+    types_of_search = ["mmr", "similarity_score_threshold"]
+
+    for type_of_search in types_of_search:
+        config["search_type"] = type_of_search
+        expRunner = ExperimentRunner(
+            config=config,
+            eval_path=eval_path,
+            queries=query_key_dict.keys(),
+            list_of_embedding_models=list_of_embedding_models,
+            list_of_llm_models=list_of_llm_models,
+            subset_ids=subset_ids,
+            use_cached_experiment=True,
+            custom_name=f"{type_of_search}_search",
+            types_of_llm_apply=types_of_llm_apply,
+        )
+        expRunner.run_experiments()
+
+    # reset the search type to its default value
+    config["search_type"] = "similarity"
+
+
+def exp_4(eval_path, config, subset_ids, query_key_dict):
+    """
+    EXPERIMENT 4
+    Evaluating chunk sizes 512 and 128 (the default is 1000).
+    Takes into account the following:
+        original data ingestion pipeline : concatenate all metadata fields and the dataset description into a single string and embed it with no pre-processing
+        list_of_embedding_models = [
+            "BAAI/bge-large-en-v1.5",
+        ]
+        list_of_llm_models = ["llama3"]
+        types_of_llm_apply : LLM applied as a reranker after the RAG pipeline
+    """
+    list_of_embedding_models = [
+        "BAAI/bge-large-en-v1.5",
+    ]
+    list_of_llm_models = ["llama3"]
+    types_of_llm_apply = [False]
+    types_of_chunk = [512, 128]
+
+    for type_of_chunk in types_of_chunk:
+        config["chunk_size"] = type_of_chunk
+        expRunner = ExperimentRunner(
+            config=config,
+            eval_path=eval_path,
+            queries=query_key_dict.keys(),
+            list_of_embedding_models=list_of_embedding_models,
+            list_of_llm_models=list_of_llm_models,
+            subset_ids=subset_ids,
+            use_cached_experiment=True,
+            custom_name=f"{type_of_chunk}_chunk",
+            types_of_llm_apply=types_of_llm_apply,
+        )
+        expRunner.run_experiments()
+
+    # reset the chunk size to its default value
+    config["chunk_size"] = 1000
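
Note: exp_0 receives its Elasticsearch worker as an argument instead of defining it. Below is a minimal sketch of that callable, adapted from the process_query helper this commit deletes from run_all_training.py; it assumes get_elastic_search_results is provided by training_utils and returns a list of Elasticsearch hit dicts.

    # Hypothetical reconstruction, not part of this commit: mirrors the
    # process_query helper deleted from run_all_training.py below.
    def process_query_elastic_search(query, dataset_id):
        res = get_elastic_search_results(query)  # assumed training_utils helper
        ids = [hit["_id"] for hit in res]  # dataset ids of the Elasticsearch hits
        # pair each hit id with the query; exp_0 writes one CSV row per pair
        return [(hit_id, query) for hit_id in ids]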

evaluation/run_all_training.py

Lines changed: 13 additions & 175 deletions

@@ -7,9 +7,9 @@
 import json
 from pathlib import Path
 import os
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from backend.modules.utils import load_config_and_device
 from training_utils import *
+from experiments import *

 if __name__ == "__main__":
     # %%
@@ -72,184 +72,22 @@
     subset_ids = list(set([int(item) for sublist in subset_ids for item in sublist]))
     # %%
     # get the queries for the datasets
-    query_key_dict = {}
-    for template in query_templates:
-        for row in load_eval_queries.itertuples():
-            new_query = f"{template} {row[1]}".strip()
-            # load_eval_queries.at[query, "Query"] = new_query
-            if new_query not in query_key_dict:
-                query_key_dict[new_query.strip()] = row[2]
-
+    query_key_dict = get_queries(query_templates=query_templates, load_eval_queries=load_eval_queries)
     json.dump(query_key_dict, open(eval_path / "query_key_dict.json", "w"))

-    """
-    EXPERIMENT 0
-    Get results from elastic search
-    """
-    # cols = ,did,name,query,llm_model,embedding_model,llm_before_rag
-
-    # for every query, get the results from elastic search
-    if not os.path.exists(eval_path / "elasticsearch" / "elasticsearch"):
-        os.makedirs(eval_path / "elasticsearch" / "elasticsearch")
-
-    output_file_path = eval_path / "elasticsearch" / "elasticsearch" / "results.csv"
-
-    def process_query(query, dataset_id):
-        res = get_elastic_search_results(query)
-        ids = [val["_id"] for val in res]
-        return [(id, query) for id in ids]
-
-    # check if the file exists and skip
-    if os.path.exists(output_file_path) == False:
-        with open(output_file_path, "w") as f:
-            f.write("did,name,query,llm_model,embedding_model,llm_before_rag\n")
-
-        # Use ThreadPoolExecutor to parallelize requests
-        with ThreadPoolExecutor(max_workers=10) as executor:
-            # Start a future for each query
-            futures = {executor.submit(process_query, query, dataset_id): query
-                       for query, dataset_id in query_key_dict.items()}
-
-            for future in tqdm(as_completed(futures), total=len(futures)):
-                result = future.result()
-                # Save the results to a CSV file
-                for id, query in result:
-                    f.write(f"{id},None,{query},es,es,None\n")
-
-    """
-    EXPERIMENT 1
-    Main evaluation loop that is used to run the base experiments using different models and embeddings.
-    Takes into account the following:
-        original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing
-        list_of_embedding_models = [
-            "BAAI/bge-large-en-v1.5",
-            "BAAI/bge-base-en-v1.5",
-            "Snowflake/snowflake-arctic-embed-l",
-        ]
-        list_of_llm_models = ["llama3", "phi3"]
-        types_of_llm_apply : llm applied as filter before the RAG pipeline, llm applied as reranker after the RAG pipeline, llm not used at all
-    """
-
-    expRunner = ExperimentRunner(
-        config=config,
-        eval_path=eval_path,
-        queries=query_key_dict.keys(),
-        list_of_embedding_models=list_of_embedding_models,
-        list_of_llm_models=list_of_llm_models,
-        subset_ids=subset_ids,
-        use_cached_experiment=True,
-    )
-    expRunner.run_experiments()
-
-    """
-    EXPERIMENT 2
-    Evaluating temperature = 1 (default was 0.95)
-    Takes into account the following:
-        original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing
-        list_of_embedding_models = [
-            "BAAI/bge-large-en-v1.5",
-        ]
-        list_of_llm_models = ["llama3"]
-        types_of_llm_apply : llm applied as filter before the RAG pipeline, llm applied as reranker after the RAG pipeline, llm not used at all
-    """
-
-    list_of_embedding_models = [
-        "BAAI/bge-large-en-v1.5",
-    ]
-    list_of_llm_models = ["llama3"]
-    config["temperature"] = 1
-
-    expRunner = ExperimentRunner(
-        config=config,
-        eval_path=eval_path,
-        queries=query_key_dict.keys(),
-        list_of_embedding_models=list_of_embedding_models,
-        list_of_llm_models=list_of_llm_models,
-        subset_ids=subset_ids,
-        use_cached_experiment=True,
-        custom_name="temperature_1",
-    )
-    expRunner.run_experiments()
-
-    # reset the temperature to the default value
-    config["temperature"] = 0.95
-
-    """
-    EXPERIMENT 3
-    Evaluating search type [mmr, similarity_score_threshold] (default was similarity)
-    Takes into account the following:
-        original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing
-        list_of_embedding_models = [
-            "BAAI/bge-large-en-v1.5",
-        ]
-        list_of_llm_models = ["llama3"]
-        types_of_llm_apply : llm applied as reranker after the RAG pipeline
-    """
-
-    list_of_embedding_models = [
-        "BAAI/bge-large-en-v1.5",
-    ]
-    list_of_llm_models = ["llama3"]
-    types_of_llm_apply = [False]
-    types_of_search = ["mmr", "similarity_score_threshold"]
-
-    for type_of_search in types_of_search:
-        config["search_type"] = type_of_search
-        expRunner = ExperimentRunner(
-            config=config,
-            eval_path=eval_path,
-            queries=query_key_dict.keys(),
-            list_of_embedding_models=list_of_embedding_models,
-            list_of_llm_models=list_of_llm_models,
-            subset_ids=subset_ids,
-            use_cached_experiment=True,
-            custom_name=f"{type_of_search}_search",
-            types_of_llm_apply=types_of_llm_apply,
-        )
-        expRunner.run_experiments()
-
-    # reset the search type to the default value
-    config["search_type"] = "similarity"
-
-    """
-    EXPERIMENT 4
-    Evaluating chunk size. The default is 1000, trying out 512,128
-    Takes into account the following:
-        original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing
-        list_of_embedding_models = [
-            "BAAI/bge-large-en-v1.5",
-        ]
-        list_of_llm_models = ["llama3"]
-        types_of_llm_apply : llm applied as reranker after the RAG pipeline
-    """
-
-    list_of_embedding_models = [
-        "BAAI/bge-large-en-v1.5",
-    ]
-    list_of_llm_models = ["llama3"]
-    types_of_llm_apply = [False]
-    types_of_chunk = [512, 128]
-    for type_of_chunk in types_of_chunk:
-        config["chunk_size"] = type_of_chunk
-        expRunner = ExperimentRunner(
-            config=config,
-            eval_path=eval_path,
-            queries=query_key_dict.keys(),
-            list_of_embedding_models=list_of_embedding_models,
-            list_of_llm_models=list_of_llm_models,
-            subset_ids=subset_ids,
-            use_cached_experiment=True,
-            custom_name=f"{type_of_chunk}_chunk",
-            types_of_llm_apply=types_of_llm_apply,
-        )
-        expRunner.run_experiments()
-
-    # reset the search type to the default value
-    config["chunk_size"] = 1000
+    # Run experiments on just queries and not filters
+
+    # Get results from Elasticsearch
+    exp_0(process_query_elastic_search, eval_path, query_key_dict)
+
+    # Experiment 1 : run the base experiments using different models and embeddings
+    exp_1(eval_path, config, list_of_embedding_models, list_of_llm_models, subset_ids, query_key_dict)
+
+    # Experiment 2 : evaluate temperature = 1 (the default is 0.95)
+    exp_2(eval_path, config, subset_ids, query_key_dict)
+
+    # Experiment 3 : evaluate the search types mmr and similarity_score_threshold (the default is similarity)
+    exp_3(eval_path, config, subset_ids, query_key_dict)
+
+    # Experiment 4 : evaluate chunk sizes 512 and 128 (the default is 1000)
+    exp_4(eval_path, config, subset_ids, query_key_dict)
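
Note: the new get_queries call is covered by the wildcard import from training_utils. Below is a sketch of its assumed behaviour, reconstructed from the inline loop this commit deletes; the helper is presumed to keep the same logic.

    # Hypothetical reconstruction, not part of this commit: mirrors the
    # deleted inline loop that built query_key_dict.
    def get_queries(query_templates, load_eval_queries):
        query_key_dict = {}
        for template in query_templates:
            for row in load_eval_queries.itertuples():
                # row[1] is the query text, row[2] the dataset id
                new_query = f"{template} {row[1]}".strip()
                if new_query not in query_key_dict:
                    query_key_dict[new_query] = row[2]
        return query_key_dict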
