@@ -7,9 +7,9 @@
 import json
 from pathlib import Path
 import os
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from backend.modules.utils import load_config_and_device
 from training_utils import *
+from experiments import *
 
 if __name__ == "__main__":
     # %%
@@ -72,184 +72,22 @@
     subset_ids = list(set([int(item) for sublist in subset_ids for item in sublist]))
     # %%
     # get the queries for the datasets
-    query_key_dict = {}
-    for template in query_templates:
-        for row in load_eval_queries.itertuples():
-            new_query = f"{template} {row[1]}".strip()
-            # load_eval_queries.at[query, "Query"] = new_query
-            if new_query not in query_key_dict:
-                query_key_dict[new_query.strip()] = row[2]
-
-
+    query_key_dict = get_queries(query_templates=query_templates, load_eval_queries=load_eval_queries)
     json.dump(query_key_dict, open(eval_path / "query_key_dict.json", "w"))
 
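Note: `get_queries` comes from the new `experiments` module, which is not part of this diff. Judging by the inline loop removed above, it presumably does something like the following sketch (the function name matches the call site; the body and signature are reconstructed here as an assumption):

```python
# Hypothetical reconstruction of experiments.get_queries, mirroring the removed inline loop.
def get_queries(query_templates, load_eval_queries):
    """Prefix every evaluation query with every template and map the result to its dataset id."""
    query_key_dict = {}
    for template in query_templates:
        for row in load_eval_queries.itertuples():
            # row[1] is the query text and row[2] the dataset id, as in the removed code
            new_query = f"{template} {row[1]}".strip()
            if new_query not in query_key_dict:
                query_key_dict[new_query] = row[2]
    return query_key_dict
```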
-    """
-    EXPERIMENT 0
-    Get results from elastic search
-    """
-    # cols = ,did,name,query,llm_model,embedding_model,llm_before_rag
-
-    # for every query, get the results from elastic search
-    if not os.path.exists(eval_path / "elasticsearch" / "elasticsearch"):
-        os.makedirs(eval_path / "elasticsearch" / "elasticsearch")
-
-    output_file_path = eval_path / "elasticsearch" / "elasticsearch" / "results.csv"
-
-    def process_query(query, dataset_id):
-        res = get_elastic_search_results(query)
-        ids = [val["_id"] for val in res]
-        return [(id, query) for id in ids]
-
-    # check if the file exists and skip
-    if os.path.exists(output_file_path) == False:
-        with open(output_file_path, "w") as f:
-            f.write("did,name,query,llm_model,embedding_model,llm_before_rag\n")
-
-            # Use ThreadPoolExecutor to parallelize requests
-            with ThreadPoolExecutor(max_workers=10) as executor:
-                # Start a future for each query
-                futures = {executor.submit(process_query, query, dataset_id): query
-                           for query, dataset_id in query_key_dict.items()}
-
-                for future in tqdm(as_completed(futures), total=len(futures)):
-                    result = future.result()
-                    # Save the results to a CSV file
-                    for id, query in result:
-                        f.write(f"{id},None,{query},es,es,None\n")
-
-    """
-    EXPERIMENT 1
-    Main evaluation loop that is used to run the base experiments using different models and embeddings.
-    Takes into account the following:
-    original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing
-    list_of_embedding_models = [
-        "BAAI/bge-large-en-v1.5",
-        "BAAI/bge-base-en-v1.5",
-        "Snowflake/snowflake-arctic-embed-l",
-    ]
-    list_of_llm_models = ["llama3", "phi3"]
-    types_of_llm_apply : llm applied as filter before the RAG pipeline, llm applied as reranker after the RAG pipeline, llm not used at all
-    """
-
-    expRunner = ExperimentRunner(
-        config=config,
-        eval_path=eval_path,
-        queries=query_key_dict.keys(),
-        list_of_embedding_models=list_of_embedding_models,
-        list_of_llm_models=list_of_llm_models,
-        subset_ids=subset_ids,
-        use_cached_experiment=True,
-    )
-    expRunner.run_experiments()
-
-    """
-    EXPERIMENT 2
-
-    Evaluating temperature = 1 (default was 0.95)
-    Takes into account the following:
-    original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing
-    list_of_embedding_models = [
-        "BAAI/bge-large-en-v1.5",
-    ]
-    list_of_llm_models = ["llama3"]
-    types_of_llm_apply : llm applied as filter before the RAG pipeline, llm applied as reranker after the RAG pipeline, llm not used at all
-    """
+    # Run experiments on just queries and not filters
 
-    list_of_embedding_models = [
-        "BAAI/bge-large-en-v1.5",
-    ]
-    list_of_llm_models = ["llama3"]
-    config["temperature"] = 1
-
-    expRunner = ExperimentRunner(
-        config=config,
-        eval_path=eval_path,
-        queries=query_key_dict.keys(),
-        list_of_embedding_models=list_of_embedding_models,
-        list_of_llm_models=list_of_llm_models,
-        subset_ids=subset_ids,
-        use_cached_experiment=True,
-        custom_name="temperature_1",
-    )
-    expRunner.run_experiments()
-
-    # reset the temperature to the default value
-    config["temperature"] = 0.95
-
-    """
-    EXPERIMENT 3
-
-    Evaluating search type [mmr, similarity_score_threshold] (default was similarity)
-    Takes into account the following:
-    original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing
-    list_of_embedding_models = [
-        "BAAI/bge-large-en-v1.5",
-    ]
-    list_of_llm_models = ["llama3"]
-    types_of_llm_apply : llm applied as reranker after the RAG pipeline
-    """
+    # Get results from elastic search
+    exp_0(process_query_elastic_search, eval_path, query_key_dict)
 
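`exp_0` and `process_query_elastic_search` are also imported from the unshown `experiments` module; the ThreadPoolExecutor block removed above is presumably what they now wrap. A minimal sketch under that assumption (`get_elastic_search_results` is the existing helper the old inline code already used):

```python
# Assumed shape of the new Elasticsearch helpers; the real code lives in the experiments module.
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm


def process_query_elastic_search(query, dataset_id):
    # Fetch Elasticsearch hits for one query and pair each hit id with the query.
    res = get_elastic_search_results(query)
    return [(hit["_id"], query) for hit in res]


def exp_0(process_fn, eval_path, query_key_dict, max_workers=10):
    output_dir = eval_path / "elasticsearch" / "elasticsearch"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file_path = output_dir / "results.csv"
    if output_file_path.exists():
        return  # results already cached, mirroring the old "check if the file exists and skip" guard
    with open(output_file_path, "w") as f:
        f.write("did,name,query,llm_model,embedding_model,llm_before_rag\n")
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(process_fn, query, dataset_id): query
                for query, dataset_id in query_key_dict.items()
            }
            for future in tqdm(as_completed(futures), total=len(futures)):
                for did, query in future.result():
                    f.write(f"{did},None,{query},es,es,None\n")
```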
+    # Experiment 1 : Run the base experiments using different models and embeddings
+    exp_1(eval_path, config, list_of_embedding_models, list_of_llm_models, subset_ids, query_key_dict)
 
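`exp_1` presumably just wraps the `ExperimentRunner` invocation removed above over the full grid of embedding and LLM models. Roughly, with the signature taken from the call site and the body reconstructed as an assumption:

```python
# Hypothetical exp_1: base grid over embedding and LLM models, mirroring the removed EXPERIMENT 1 block.
def exp_1(eval_path, config, list_of_embedding_models, list_of_llm_models, subset_ids, query_key_dict):
    expRunner = ExperimentRunner(
        config=config,
        eval_path=eval_path,
        queries=query_key_dict.keys(),
        list_of_embedding_models=list_of_embedding_models,
        list_of_llm_models=list_of_llm_models,
        subset_ids=subset_ids,
        use_cached_experiment=True,
    )
    expRunner.run_experiments()
```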
-    list_of_embedding_models = [
-        "BAAI/bge-large-en-v1.5",
-    ]
-    list_of_llm_models = ["llama3"]
-    types_of_llm_apply = [False]
-    types_of_search = ["mmr", "similarity_score_threshold"]
-
-    for type_of_search in types_of_search:
-        config["search_type"] = type_of_search
-        expRunner = ExperimentRunner(
-            config=config,
-            eval_path=eval_path,
-            queries=query_key_dict.keys(),
-            list_of_embedding_models=list_of_embedding_models,
-            list_of_llm_models=list_of_llm_models,
-            subset_ids=subset_ids,
-            use_cached_experiment=True,
-            custom_name=f"{type_of_search}_search",
-            types_of_llm_apply=types_of_llm_apply,
-        )
-        expRunner.run_experiments()
-
-    # reset the search type to the default value
-    config["search_type"] = "similarity"
-
-
-    """
-    EXPERIMENT 4
-
-    Evaluating chunk size. The default is 1000, trying out 512,128
-    Takes into account the following:
-    original data ingestion pipeline : combine a string of all metadata fields and the dataset description and embeds them with no pre-processing
-    list_of_embedding_models = [
-        "BAAI/bge-large-en-v1.5",
-    ]
-    list_of_llm_models = ["llama3"]
-    types_of_llm_apply : llm applied as reranker after the RAG pipeline
-    """
+    # Experiment 2 : Evaluating temperature = 1 (default was 0.95)
+    exp_2(eval_path, config, subset_ids, query_key_dict)
 
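The EXPERIMENT 2 code removed above narrows the grid to bge-large and llama3 and overrides the sampling temperature, so `exp_2` is presumably along these lines (again an assumption about the unshown module, not its actual code):

```python
# Hypothetical exp_2: temperature ablation, reconstructed from the removed EXPERIMENT 2 code.
def exp_2(eval_path, config, subset_ids, query_key_dict):
    config["temperature"] = 1
    expRunner = ExperimentRunner(
        config=config,
        eval_path=eval_path,
        queries=query_key_dict.keys(),
        list_of_embedding_models=["BAAI/bge-large-en-v1.5"],
        list_of_llm_models=["llama3"],
        subset_ids=subset_ids,
        use_cached_experiment=True,
        custom_name="temperature_1",
    )
    expRunner.run_experiments()
    # reset the temperature to the default value (0.95)
    config["temperature"] = 0.95
```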
+    # Experiment 3 : Evaluating search type [mmr, similarity_score_threshold] (default was similarity)
+    exp_3(eval_path, config, subset_ids, query_key_dict)
 
-    list_of_embedding_models = [
-        "BAAI/bge-large-en-v1.5",
-    ]
-    list_of_llm_models = ["llama3"]
-    types_of_llm_apply = [False]
-    types_of_chunk = [512, 128]
-    for type_of_chunk in types_of_chunk:
-        config["chunk_size"] = type_of_chunk
-        expRunner = ExperimentRunner(
-            config=config,
-            eval_path=eval_path,
-            queries=query_key_dict.keys(),
-            list_of_embedding_models=list_of_embedding_models,
-            list_of_llm_models=list_of_llm_models,
-            subset_ids=subset_ids,
-            use_cached_experiment=True,
-            custom_name=f"{type_of_chunk}_chunk",
-            types_of_llm_apply=types_of_llm_apply,
-        )
-        expRunner.run_experiments()
-
-    # reset the search type to the default value
-    config["chunk_size"] = 1000
+    # Experiment 4 : Evaluating chunk size. The default is 1000, trying out 512,128
+    exp_4(eval_path, config, subset_ids, query_key_dict)
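`exp_3` and `exp_4` presumably follow the same shape as the removed parameter-sweep loops above: set one retrieval option, run the `ExperimentRunner`, then restore the default. For example, `exp_4` reconstructed from the removed EXPERIMENT 4 loop (an assumption, like the other sketches):

```python
# Hypothetical exp_4: chunk-size ablation, reconstructed from the removed EXPERIMENT 4 loop.
def exp_4(eval_path, config, subset_ids, query_key_dict):
    for chunk_size in [512, 128]:
        config["chunk_size"] = chunk_size
        expRunner = ExperimentRunner(
            config=config,
            eval_path=eval_path,
            queries=query_key_dict.keys(),
            list_of_embedding_models=["BAAI/bge-large-en-v1.5"],
            list_of_llm_models=["llama3"],
            subset_ids=subset_ids,
            use_cached_experiment=True,
            custom_name=f"{chunk_size}_chunk",
            types_of_llm_apply=[False],
        )
        expRunner.run_experiments()
    # reset the chunk size to the default value (1000)
    config["chunk_size"] = 1000
```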