Commit 7cb732e

crawler API
1 parent be87ba6 commit 7cb732e

36 files changed (+738, -344 lines)

backend/backend.py

Lines changed: 5 additions & 5 deletions
@@ -17,12 +17,12 @@
 config["data_dir"] = "./data/testing_data/"

 # load the persistent database using ChromaDB
-print('Loading DB')
+print("Loading DB")
 client = chromadb.PersistentClient(path=config["persist_dir"])
 # Loading the metadata for all types

 # Setup llm chain, initialize the retriever and llm, and setup Retrieval QA
-print('Setting LLM chain')
+print("Setting LLM chain")
 qa_dataset_handler = QASetup(
     config=config,
     data_type="dataset",
@@ -43,7 +43,7 @@
 llm_chain_handler = LLMChainCreator(config=config, local=True)
 llm_chain_handler.enable_cache()
 llm_chain = llm_chain_handler.get_llm_chain()
-print('OK.')
+print("OK.")

 # Send test query as first query to avoid cold start
 try:
@@ -68,7 +68,7 @@ async def read_dataset(query: str):
     _, ids_order = QueryProcessor(
         query=query,
         qa=qa_dataset,
-        type_of_query='dataset',
+        type_of_query="dataset",
         config=config,
     ).get_result_from_query()

@@ -89,7 +89,7 @@ async def read_flow(query: str):
     _, ids_order = QueryProcessor(
         query=query,
         qa=qa_flow,
-        type_of_query='flow',
+        type_of_query="flow",
         config=config,
     ).get_result_from_query()

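For orientation, a rough client-side sketch of how these two endpoints might be called. Only the handler signatures `read_dataset(query: str)` and `read_flow(query: str)` are visible in the diff; the route paths, port, and response shape below are assumptions.

```python
# Hedged client sketch: route path, port, and response shape are assumptions,
# not confirmed by this commit -- check the FastAPI decorators in backend/backend.py.
import requests

BASE_URL = "http://localhost:8000"  # assumed uvicorn default


def search_datasets(query: str):
    # assumed route for the read_dataset handler
    response = requests.get(f"{BASE_URL}/dataset/{query}")
    response.raise_for_status()
    return response.json()  # presumably the ordered ids from QueryProcessor


if __name__ == "__main__":
    print(search_datasets("mushroom classification"))
```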
backend/modules/metadata_utils.py

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,7 @@ class OpenMLObjectHandler:
     """
     Description: The base class for handling OpenML objects. The logic for handling datasets/flows are subclasses from this.
     """
+
     def __init__(self, config):
         self.config = config

@@ -268,6 +269,7 @@ class OpenMLMetadataProcessor:
     """
     Description: Process metadata using the OpenMLHandlers
     """
+
     def __init__(self, config: dict):
         self.config = config
         self.save_filename = os.path.join(

backend/modules/rag_llm.py

Lines changed: 7 additions & 1 deletion
@@ -26,14 +26,18 @@ class LLMChainInitializer:
     """
     Description: Setup the vectordb (Chroma) as a retriever with parameters
     """
+
     @staticmethod
     def initialize_llm_chain(
         vectordb: Chroma, config: dict
     ) -> langchain.chains.retrieval_qa.base.RetrievalQA:
         if config["search_type"] == "similarity_score_threshold":
             return vectordb.as_retriever(
                 search_type=config["search_type"],
-                search_kwargs={"k": config["num_return_documents"], "score_threshold": 0.5},
+                search_kwargs={
+                    "k": config["num_return_documents"],
+                    "score_threshold": 0.5,
+                },
             )
         else:
             return vectordb.as_retriever(
@@ -46,6 +50,7 @@ class QASetup:
     """
     Description: Setup the VectorDB, QA and initalize the LLM for each type of data
     """
+
     def __init__(
         self, config: dict, data_type: str, client: ClientAPI, subset_ids: list = None
     ):
@@ -80,6 +85,7 @@ class LLMChainCreator:
     """
     Description: Gets Ollama, sends query, enables query caching
     """
+
     def __init__(self, config: dict, local: bool = False):
         self.config = config
         self.local = local
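The retriever behaviour above is driven entirely by `config`. A sketch of the keys this commit touches, with purely illustrative values:

```python
# Keys taken from the code in this commit; the values shown are illustrative
# assumptions, not the repository's actual defaults.
config = {
    "search_type": "similarity_score_threshold",  # or "similarity"
    "num_return_documents": 30,                   # top-k passed to the retriever
    "chunk_size": 1000,                           # used by DataLoader
    "temperature": 0.95,                          # reset to this default in exp_2
    "persist_dir": "./data/chroma_db/",           # ChromaDB path (assumed)
}
```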

backend/modules/vector_store_utils.py

Lines changed: 17 additions & 4 deletions
@@ -14,11 +14,20 @@ class DataLoader:
     """
     Description: Used to chunk data
     """
-    def __init__(self, metadata_df: pd.DataFrame, page_content_column: str, chunk_size:int = 1000, chunk_overlap:int = 150):
+
+    def __init__(
+        self,
+        metadata_df: pd.DataFrame,
+        page_content_column: str,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 150,
+    ):
         self.metadata_df = metadata_df
         self.page_content_column = page_content_column
         self.chunk_size = chunk_size
-        self.chunk_overlap = chunk_overlap if self.chunk_size > chunk_overlap else self.chunk_size
+        self.chunk_overlap = (
+            chunk_overlap if self.chunk_size > chunk_overlap else self.chunk_size
+        )

     def load_and_process_data(self) -> list:
         """
@@ -41,6 +50,7 @@ class DocumentProcessor:
     """
     Description: Used to generate unique documents based on text content to prevent duplicates during embedding
     """
+
     @staticmethod
     def generate_unique_documents(documents: list, db: Chroma) -> tuple:
         """
@@ -74,6 +84,7 @@ class VectorStoreManager:
     """
     Description: Manages the Vector store (chromadb) and takes care of data ingestion, loading the embedding model and embedding the data before adding it to the vector store
     """
+
     def __init__(self, chroma_client: ClientAPI, config: dict):
         self.chroma_client = chroma_client
         self.config = config
@@ -145,7 +156,7 @@ def load_vector_store(
         )

     @staticmethod
-    def add_documents_to_db(db, unique_docs, unique_ids, bs = 512):
+    def add_documents_to_db(db, unique_docs, unique_ids, bs=512):
         """
         Description: Add documents to Chroma DB in batches of bs
         """
@@ -170,7 +181,9 @@ def create_vector_store(self, metadata_df: pd.DataFrame) -> Chroma:
         )

         data_loader = DataLoader(
-            metadata_df, page_content_column="Combined_information", chunk_size = self.config["chunk_size"]
+            metadata_df,
+            page_content_column="Combined_information",
+            chunk_size=self.config["chunk_size"],
         )
         documents = data_loader.load_and_process_data()

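The diff only shows the reformatted signature of `add_documents_to_db`; as a reference point, here is a minimal sketch of the batched insert its docstring describes (the actual method body in `VectorStoreManager` may differ):

```python
# Minimal sketch, not the repository's implementation: add documents to a
# LangChain Chroma store in batches of `bs`, as the docstring describes.
from tqdm import tqdm


def add_documents_to_db(db, unique_docs, unique_ids, bs=512):
    """Add documents to Chroma DB in batches of bs."""
    for start in tqdm(range(0, len(unique_docs), bs)):
        db.add_documents(
            documents=unique_docs[start : start + bs],
            ids=unique_ids[start : start + bs],
        )
```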
5 binary files changed (contents not shown).

docs/evaluation/evaluation.md

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@

 - It is "pretty easy" to add a new evaluation.
 - (Note that `training_utils.py` already overloads some classes from the original training. Which means that you can modify this to your hearts content without affecting the main code. Enjoy~)
-  - Step 1: Find the method you want to override and overload the class/method in `training_utils.py`.
+  - Step 1: Find the method you want to override and overload the class/method in `experiments.py`.
   - Step 2: Add some if statements in `class ExperimentRunner` to ensure you dont break everything.
   - Step 3: Follow the ExperimentRunner templates in `run_all_training.py` to add whatever you added in Step 2 as a new experiment.
     - Give it a custom name so it is easy to understand what happens

evaluation/README.md

Lines changed: 29 additions & 4 deletions
@@ -1,7 +1,32 @@
 # Evaluation of LLM models and techniques

 ## How to run
-- Start the language server at the root of this repository with `./start_llm_service.sh`
-- Run `python run_all_training.py` to train all models (get data, create vector store for each etc)
-- Run `python evaluate.py` to run all evaluations
-- Results are found in in `./evaluation_results.csv`
+- Start the language server at the root of this repository with `./start_llm_service.sh` . This is important, do not skip it.
+- Run `python run_all_training.py` to train all models (get data, create vector store for each etc) and run the models on all possible versions of the queries.
+  - Query templates are in `data/evaluation/query_templates.txt`. Add to this if you want different types of queries.
+- Run `python evaluate.py` to aggregate the results from the previous query. (This does not run the models on the queries)
+- Results are found in in `./evaluation_results.csv` and `evaluation_results.png`
+- **Important note** : If you want to re-run some experiments because things have changed and if the models that you use are the same but the data/labels are new.
+  - Go to `/data/evaluation/{rag-model}/{llm-model}` and remove/move all the folders under it **except** `chroma_db`. If new data is added, the training loop will take care of adding them to the vector database. But if you remove this, it will take a lot longer for the data to be embedded from scratch.
+
+## How to add a new evaluation
+
+- It is "pretty easy" to add a new evaluation.
+- (Note that `training_utils.py` already overloads some classes from the original training. Which means that you can modify this to your hearts content without affecting the main code. Enjoy~)
+  - Step 1: Find the method you want to override and overload the class/method in `experiments.py`.
+  - Step 2: Add some if statements in `class ExperimentRunner` to ensure you dont break everything.
+  - Step 3: Follow the ExperimentRunner templates in `run_all_training.py` to add whatever you added in Step 2 as a new experiment.
+    - Give it a custom name so it is easy to understand what happens
+    - Do not worry, the experiments are cached and won't run again if you have run them before.
+  - Step 4: If you changed something from config, make sure you reset it. Since the file runs in one go, it will affect the following experiments otherwise.
+
+## How to add a new metric
+
+- In `evaluation_utils.py`, go to `class EvaluationProcessor`, add a new function that calculates your metric. (You can use the templates provided)
+- Update the metric in `self.metric_methods`
+- While running the evaluation, add them to your metrics list :
+```python
+metrics = ["precision", "recall", "map"]
+eval_path = Path("../data/evaluation/")
+processor = EvaluationProcessor(eval_path, sort_by=None, metrics=metrics)
+```
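As an illustration of the "add a new metric" recipe in the README above, a hedged sketch of a hypothetical `add_mrr` method. Only the `metric_methods` registry and the comma-joined `y_true`/`y_pred` columns are visible in this commit; the exact helper signature is an assumption.

```python
# Hypothetical metric, mirroring the pattern suggested by self.metric_methods in
# evaluation_utils.py; the signature and column handling are assumptions.
def add_mrr(self, grouped_df):
    """Add mean reciprocal rank per row, assuming comma-joined y_true/y_pred columns."""

    def mrr(row):
        true_ids = set(row["y_true"].split(","))
        for rank, pred in enumerate(row["y_pred"].split(","), start=1):
            if pred in true_ids:
                return 1.0 / rank
        return 0.0

    grouped_df["mrr"] = grouped_df.apply(mrr, axis=1)
    return grouped_df


# ...then register it so it can be requested by name:
# self.metric_methods["mrr"] = self.add_mrr
# metrics = ["precision", "recall", "map", "mrr"]
```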

evaluation/evaluate.py

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 import pandas as pd
 from evaluation_utils import EvaluationProcessor
 from tqdm import tqdm
+
 metrics = ["precision", "recall", "map"]
 eval_path = Path("../data/evaluation/")
 processor = EvaluationProcessor(eval_path, sort_by=None, metrics=metrics)

evaluation/evaluation_utils.py

Lines changed: 6 additions & 5 deletions
@@ -9,9 +9,8 @@ class EvaluationProcessor:
     """
     Description: Process all the evaluated results, add the required metrics and save results as a csv/generate plots
     """
-    def __init__(
-        self, eval_path, metrics=None, sort_by="precision"
-    ):
+
+    def __init__(self, eval_path, metrics=None, sort_by="precision"):
         if metrics is None:
             metrics = ["precision", "recall", "map"]
         self.eval_path = eval_path
@@ -25,7 +24,7 @@ def __init__(
         self.metric_methods = {
             "precision": self.add_precision,
             "recall": self.add_recall,
-            "map": self.add_map
+            "map": self.add_map,
         }

     def run(self):
@@ -72,7 +71,9 @@ def generate_results(self, csv_files):
             ]
         ).agg({"y_true": ",".join, "y_pred": ",".join})

-        grouped_results_for_y_true_and_pred = self.add_metrics(grouped_results_for_y_true_and_pred)
+        grouped_results_for_y_true_and_pred = self.add_metrics(
+            grouped_results_for_y_true_and_pred
+        )

         # aggregate by computing the average of the metrics for each group
         grouped_results_for_y_true_and_pred = (

evaluation/experiments.py

Lines changed: 20 additions & 8 deletions
@@ -1,8 +1,8 @@
-
 from training_utils import *
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from tqdm.auto import tqdm

+
 def exp_0(process_query_elastic_search, eval_path, query_key_dict):
     """
     EXPERIMENT 0
@@ -21,17 +21,28 @@ def exp_0(process_query_elastic_search, eval_path, query_key_dict):
     # Use ThreadPoolExecutor to parallelize requests
     with ThreadPoolExecutor(max_workers=10) as executor:
         # Start a future for each query
-        futures = {executor.submit(process_query_elastic_search, query, dataset_id): query for query, dataset_id
-                   in
-                   query_key_dict.items()}
+        futures = {
+            executor.submit(
+                process_query_elastic_search, query, dataset_id
+            ): query
+            for query, dataset_id in query_key_dict.items()
+        }

         for future in tqdm(as_completed(futures), total=len(futures)):
             result = future.result()
             # Save the results to a CSV file
             for id, query in result:
                 f.write(f"{id},None,{query},es,es,None\n")

-def exp_1(eval_path, config, list_of_embedding_models, list_of_llm_models, subset_ids, query_key_dict):
+
+def exp_1(
+    eval_path,
+    config,
+    list_of_embedding_models,
+    list_of_llm_models,
+    subset_ids,
+    query_key_dict,
+):
     """
     EXPERIMENT 1
     Main evaluation loop that is used to run the base experiments using different models and embeddings.
@@ -57,6 +68,7 @@ def exp_1(eval_path, config, list_of_embedding_models, list_of_llm_models, subse
     )
     expRunner.run_experiments()

+
 def exp_2(eval_path, config, subset_ids, query_key_dict):
     """
     EXPERIMENT 2
@@ -91,6 +103,7 @@ def exp_2(eval_path, config, subset_ids, query_key_dict):
     # reset the temperature to the default value
     config["temperature"] = 0.95

+
 def exp_3(eval_path, config, subset_ids, query_key_dict):
     """
     EXPERIMENT 3
@@ -104,7 +117,6 @@ def exp_3(eval_path, config, subset_ids, query_key_dict):
     types_of_llm_apply : llm applied as reranker after the RAG pipeline
     """

-
     list_of_embedding_models = [
         "BAAI/bge-large-en-v1.5",
     ]
@@ -130,6 +142,7 @@ def exp_3(eval_path, config, subset_ids, query_key_dict):
     # reset the search type to the default value
     config["search_type"] = "similarity"

+
 def exp_4(eval_path, config, subset_ids, query_key_dict):
     """
     EXPERIMENT 4
@@ -143,7 +156,6 @@ def exp_4(eval_path, config, subset_ids, query_key_dict):
     types_of_llm_apply : llm applied as reranker after the RAG pipeline
     """

-
     list_of_embedding_models = [
         "BAAI/bge-large-en-v1.5",
     ]
@@ -166,4 +178,4 @@ def exp_4(eval_path, config, subset_ids, query_key_dict):
     expRunner.run_experiments()

     # reset the search type to the default value
-    config["chunk_size"] = 1000
+    config["chunk_size"] = 1000
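`exp_0` expects `process_query_elastic_search` to return `(id, query)` pairs that get written to the CSV. That helper is defined elsewhere in the repository; a hypothetical sketch consistent with the Elasticsearch snippet further down in this commit:

```python
# Hypothetical stand-in for the helper exp_0 receives; the real one is defined
# elsewhere (e.g. training_utils.py) and may use dataset_id, which is unused here.
import requests


def process_query_elastic_search(query, dataset_id):
    # query OpenML's Elasticsearch index and return (hit_id, query) pairs,
    # matching the `for id, query in result:` loop in exp_0
    response = requests.get("https://es.openml.org/_search", params={"q": query})
    hits = response.json()["hits"]["hits"]
    return [(hit["_id"], query) for hit in hits]
```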
Lines changed: 12 additions & 8 deletions
@@ -1,13 +1,17 @@
-#%%
+# %%
 import requests
-#%%
+
+
+# %%
 def get_elastic_search_results(query):
-    query = query.replace(' ', '%20')
-    url = 'https://es.openml.org/_search?q=' + query
+    query = query.replace(" ", "%20")
+    url = "https://es.openml.org/_search?q=" + query
     response = requests.get(url)
     response_json = response.json()
-    return response_json['hits']['hits']
-#%%
-res = get_elastic_search_results('iris')
+    return response_json["hits"]["hits"]
+
+
+# %%
+res = get_elastic_search_results("iris")
 # %%
-ids = [val['_id'] for val in res]
+ids = [val["_id"] for val in res]

evaluation/run_all_training.py

Lines changed: 12 additions & 3 deletions
@@ -31,7 +31,7 @@
     "BAAI/bge-large-en-v1.5",
     "BAAI/bge-base-en-v1.5",
     "Snowflake/snowflake-arctic-embed-l",
-    "Alibaba-NLP/gte-large-en-v1.5"
+    "Alibaba-NLP/gte-large-en-v1.5",
 ]
 list_of_llm_models = ["llama3"]

@@ -72,7 +72,9 @@
 subset_ids = list(set([int(item) for sublist in subset_ids for item in sublist]))
 # %%
 # get the queries for the datasets
-query_key_dict = get_queries(query_templates=query_templates, load_eval_queries=load_eval_queries)
+query_key_dict = get_queries(
+    query_templates=query_templates, load_eval_queries=load_eval_queries
+)
 json.dump(query_key_dict, open(eval_path / "query_key_dict.json", "w"))

 # Run experiments on just queries and not filters
@@ -81,7 +83,14 @@
 exp_0(process_query_elastic_search, eval_path, query_key_dict)

 # Experiment 1 : Run the base experiments using different models and embeddings
-exp_1(eval_path, config, list_of_embedding_models, list_of_llm_models, subset_ids, query_key_dict)
+exp_1(
+    eval_path,
+    config,
+    list_of_embedding_models,
+    list_of_llm_models,
+    subset_ids,
+    query_key_dict,
+)

 # Experiment 2 : Evaluating temperature = 1 (default was 0.95)
 exp_2(eval_path, config, subset_ids, query_key_dict)
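`query_key_dict` maps each generated query string to the dataset id it was built from (it is iterated as `(query, dataset_id)` pairs in `exp_0` and serialized to `query_key_dict.json` above). An illustrative example of its shape, with made-up entries:

```python
# Illustrative shape only -- real entries come from get_queries() and the
# templates in data/evaluation/query_templates.txt; queries and ids are made up.
query_key_dict = {
    "Find me a dataset about mushrooms": "24",
    "Find me a dataset about credit risk": "31",
}
```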
