From 8e89e52a9a1931b1a5172c3961b14838be0c1d7a Mon Sep 17 00:00:00 2001
From: SubhadityaMukherjee
Date: Mon, 8 Jul 2024 16:56:36 +0200
Subject: [PATCH] fixed some bugs here and there

---
 docs/inference.md                                  |    4 +-
 docs/query_llm.md                                  |    4 +
 frontend/ui.py                                     |   77 +-
 frontend/ui_utils.py                               |  184 ++
 frontend/utils.py                                  |   60 -
 llm_service/llm_service.py                         |   57 +-
 site/404.html                                      |   36 +-
 site/configuration/index.html                      |   36 +-
 .../change model/index.html                        |   36 +-
 .../create vectordb/index.html                     |   36 +-
 .../get an llm summary/index.html                  |   36 +-
 site/developer tutorials/index.html                |   36 +-
 .../load vectordb and get results/index.html       |   36 +-
 .../index.html                                     |  385 +---
 .../train and evaluate models/index.html           |  622 +----
 site/docker/index.html                             |   36 +-
 site/index.html                                    |   36 +-
 site/inference/index.html                          |  772 ++++++-
 site/modules/general_utils/index.html              |   60 +-
 site/modules/llm_module/index.html                 |  220 +-
 site/modules/metadata_module/index.html            | 2002 ++++++++++++++-------
 site/modules/result_gen/index.html                 |  214 +-
 site/objects.inv                                   |  Bin 689 -> 892 bytes
 site/query_llm/index.html                          | 1001 +++++++++
 site/search/search_index.json                      |    2 +-
 site/sitemap.xml.gz                                |  Bin 127 -> 127 bytes
 site/testing/index.html                            |  150 +-
 site/training/index.html                           |   36 +-
 28 files changed, 3935 insertions(+), 2239 deletions(-)
 create mode 100644 docs/query_llm.md
 create mode 100644 frontend/ui_utils.py
 delete mode 100644 frontend/utils.py
 create mode 100644 site/query_llm/index.html

diff --git a/docs/inference.md b/docs/inference.md
index 50e0a71..85ccb65 100644
--- a/docs/inference.md
+++ b/docs/inference.md
@@ -9,4 +9,6 @@
 
 ## Errors
 
-- If you get an error about file permissions, run `chmod +x start_local.sh` and `chmod +x stop_local.sh` to make them executable.
\ No newline at end of file
+- If you get an error about file permissions, run `chmod +x start_local.sh` and `chmod +x stop_local.sh` to make them executable.
+ +:::ui_utils \ No newline at end of file diff --git a/docs/query_llm.md b/docs/query_llm.md new file mode 100644 index 0000000..b3c3bd1 --- /dev/null +++ b/docs/query_llm.md @@ -0,0 +1,4 @@ +# LLM Query parsing + +- The LLM reads the query and parses it into a list of filters based on a prompt +:::llm_service diff --git a/frontend/ui.py b/frontend/ui.py index 2d585d4..65917ab 100644 --- a/frontend/ui.py +++ b/frontend/ui.py @@ -2,11 +2,9 @@ from pathlib import Path import pandas as pd -import requests import streamlit as st -from streamlit_feedback import streamlit_feedback -from utils import (feedback_cb, filter_initial_response, parse_llm_response, - update_subset_cols) +from ui_utils import run_streamlit + with open("../backend/config.json", "r") as file: config = json.load(file) @@ -19,6 +17,7 @@ data_metadata = pd.read_csv(data_metadata) flow_metadata = pd.read_csv(flow_metadata) + # Main Streamlit App st.title("OpenML AI Search") @@ -28,72 +27,4 @@ st.session_state["query"] = query response = {"initial_response": None} - -if st.button("Submit"): - with st.spinner("Waiting for results..."): - try: - response = requests.get( - f"http://fastapi:8000/{query_type.lower()}/{query}", - json={"query": query, "type": query_type.lower()}, - ).json() - except: - response = requests.get( - f"http://0.0.0.0:8000/{query_type.lower()}/{query}", - json={"query": query, "type": query_type.lower()}, - ).json() - - if response["initial_response"] is not None: - st.write("Results:") - - # response is the ids, we need to get the metdata from the ids - if query_type == "Dataset": - initial_response = data_metadata[ - data_metadata["did"].isin(response["initial_response"]) - ] - # subset_cols = ["did", "name","OpenML URL","Description", "command"] - else: - initial_response = flow_metadata[ - flow_metadata["id"].isin(response["initial_response"]) - ] - - # def process query using results from port 8001/llmquery/{query} - with st.spinner("Using an LLM to find the most relevent information..."): - if query_type == "Dataset": - try: - llm_response = requests.get( - f"http://fastapi:8081/llmquery/{query}" - ).json() - except: - llm_response = requests.get( - f"http://0.0.0.0:8081/llmquery/{query}" - ).json() - - subset_cols = ["did", "name"] - try: - ( - dataset_size, - dataset_missing, - dataset_classification, - dataset_sort, - ) = parse_llm_response(llm_response) - subset_cols = update_subset_cols( - dataset_size, dataset_missing, dataset_classification - ) - initial_response = filter_initial_response( - initial_response, dataset_classification - ) - except Exception as e: - st.error(f"Error processing LLM response: {e}") - - initial_response = initial_response[subset_cols] - - st.dataframe(initial_response) - - with st.form("fb_form"): - streamlit_feedback( - feedback_type="thumbs", - align="flex-start", - key="fb_k", - optional_text_label="[Optional] Please provide an explanation", - ) - st.form_submit_button("Save feedback", on_click=feedback_cb) +run_streamlit() \ No newline at end of file diff --git a/frontend/ui_utils.py b/frontend/ui_utils.py new file mode 100644 index 0000000..015666e --- /dev/null +++ b/frontend/ui_utils.py @@ -0,0 +1,184 @@ +import json +import os + +from streamlit import session_state as ss +from streamlit_feedback import streamlit_feedback +import requests +import streamlit as st + +def feedback_cb(): + """ + Description: Callback function to save feedback to a file + + Input: None + + Returns: None + """ + file_path = "feedback.json" + + if os.path.exists(file_path): + with 
open(file_path, "r") as file: + try: + data = json.load(file) + except json.JSONDecodeError: + data = [] + else: + data = [] + + # Append new feedback + data.append({"ss": ss.fb_k, "query": ss.query}) + + # Write updated content back to the file + with open(file_path, "w") as file: + json.dump(data, file, indent=4) + + +def parse_llm_response(response): + """ + Description: Parse the answers from the LLM response + + Input: response (dict) + + Returns: size (str), missing (str), classification (str), sort (str) + """ + size, missing, classification = response["answers"] + size, sort = size.split(",") if "," in size else (size, None) + return size, missing, classification, sort + + +def update_subset_cols(size, missing, classification): + """ + Description: Update the subset columns based on LLM's response + + Input: size (str), missing (str), classification (str) + + Returns: cols (list) + """ + cols = ["did", "name"] + if size == "yes": + cols.append("NumberOfInstances") + if missing == "yes": + cols.append("NumberOfMissingValues") + if classification != "none": + cols.append("NumberOfClasses") + return cols + + +def filter_initial_response(response, classification): + """ + Description: Filter the initial response based on the classification + + Input: response (DataFrame), classification (str) + + Returns: response (DataFrame) + """ + if classification != "none": + if "multi" in classification: + response = response[response["NumberOfClasses"] > 2] + elif "binary" in classification: + response = response[response["NumberOfClasses"] == 2] + return response + + +def fetch_response(query_type, query): + """ + Description: Fetch the response from the FastAPI service + + Input: query_type (str), query (str) + + Returns: response (dict) + """ + try: + response = requests.get( + f"http://fastapi:8000/{query_type.lower()}/{query}", + json={"query": query, "type": query_type.lower()}, + ).json() + except: + response = requests.get( + f"http://0.0.0.0:8000/{query_type.lower()}/{query}", + json={"query": query, "type": query_type.lower()}, + ).json() + return response + +def fetch_llm_response(query): + """ + Description: Fetch the response from the LLM service + + Input: query (str) + + Returns: llm_response (dict) + """ + try: + llm_response = requests.get(f"http://fastapi:8081/llmquery/{query}").json() + except: + llm_response = requests.get(f"http://0.0.0.0:8081/llmquery/{query}").json() + return llm_response + +def parse_and_update_response(query_type, response, llm_response, data_metadata, flow_metadata): + """ + Description: Parse and update the response based on the query type + + Input: query_type (str), response (dict), llm_response (dict), data_metadata (DataFrame), flow_metadata (DataFrame) + + Returns: initial_response (DataFrame) + """ + if query_type == "Dataset": + initial_response = data_metadata[data_metadata["did"].isin(response["initial_response"])] + subset_cols = ["did", "name"] + try: + dataset_size, dataset_missing, dataset_classification, dataset_sort = parse_llm_response(llm_response) + subset_cols = update_subset_cols(dataset_size, dataset_missing, dataset_classification) + initial_response = filter_initial_response(initial_response, dataset_classification) + except Exception as e: + st.error(f"Error processing LLM response: {e}") + initial_response = initial_response[subset_cols] + else: + initial_response = flow_metadata[flow_metadata["id"].isin(response["initial_response"])] + return initial_response + +def display_results(initial_response): + """ + Description: 
Display the results in a DataFrame + + Input: initial_response (DataFrame) + + Returns: None + """ + st.write("Results:") + st.dataframe(initial_response) + +def run_streamlit(): + """ + Description: Run the Streamlit app + + Input: None + + Returns: None + """ + if st.button("Submit"): + with st.spinner("Waiting for results..."): + query_type = st.session_state['query_type'] + query = st.session_state['query'] + data_metadata = st.session_state['data_metadata'] + flow_metadata = st.session_state['flow_metadata'] + + response = fetch_response(query_type, query) + + if response["initial_response"] is not None: + if query_type == "Dataset": + with st.spinner("Using an LLM to find the most relevant information..."): + llm_response = fetch_llm_response(query) + initial_response = parse_and_update_response(query_type, response, llm_response, data_metadata, flow_metadata) + else: + initial_response = parse_and_update_response(query_type, response, None, data_metadata, flow_metadata) + + display_results(initial_response) + + with st.form("fb_form"): + streamlit_feedback( + feedback_type="thumbs", + align="flex-start", + key="fb_k", + optional_text_label="[Optional] Please provide an explanation", + ) + st.form_submit_button("Save feedback", on_click=feedback_cb) \ No newline at end of file diff --git a/frontend/utils.py b/frontend/utils.py deleted file mode 100644 index a8bdc49..0000000 --- a/frontend/utils.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -import os - -from streamlit import session_state as ss - - -def feedback_cb(): - """ - Description: Callback function to save feedback to a file - - Input: None - - Returns: None - """ - file_path = "feedback.json" - - if os.path.exists(file_path): - with open(file_path, "r") as file: - try: - data = json.load(file) - except json.JSONDecodeError: - data = [] - else: - data = [] - - # Append new feedback - data.append({"ss": ss.fb_k, "query": ss.query}) - - # Write updated content back to the file - with open(file_path, "w") as file: - json.dump(data, file, indent=4) - - -def parse_llm_response(response): - # Define the function to parse the llm_response - size, missing, classification = response["answers"] - size, sort = size.split(",") if "," in size else (size, None) - return size, missing, classification, sort - - -def update_subset_cols(size, missing, classification): - # Define the function to update the subset columns - cols = ["did", "name"] - if size == "yes": - cols.append("NumberOfInstances") - if missing == "yes": - cols.append("NumberOfMissingValues") - if classification != "none": - cols.append("NumberOfClasses") - return cols - - -def filter_initial_response(response, classification): - # Define the function to filter the initial response based on classification - if classification != "none": - if "multi" in classification: - response = response[response["NumberOfClasses"] > 2] - elif "binary" in classification: - response = response[response["NumberOfClasses"] == 2] - return response diff --git a/llm_service/llm_service.py b/llm_service/llm_service.py index 5881eec..48e895a 100644 --- a/llm_service/llm_service.py +++ b/llm_service/llm_service.py @@ -2,7 +2,7 @@ from fastapi import FastAPI from fastapi.responses import JSONResponse from httpx import ConnectTimeout -from langchain_community.chat_models import ChatOllama +from langchain_community.chat_models.ollama import ChatOllama from langchain_core.output_parsers import StrOutputParser from langchain_core.prompts import ChatPromptTemplate from tenacity import retry, 
retry_if_exception_type, stop_after_attempt @@ -16,37 +16,51 @@ def create_chain(prompt, model="llama3", temperature=0): + """ + Description: Create a chain with the given prompt and model + + Input: prompt (str), model (str), temperature (float) + + Returns: chain (Chain) + """ llm = ChatOllama(model=model, temperature=temperature) prompt = ChatPromptTemplate.from_template(prompt) - # using LangChain Expressive Language chain syntax - # learn more about the LCEL on - # /docs/concepts/#langchain-expression-language-lcel return prompt | llm | StrOutputParser() def parse_answers_initial(response): - # for each line in the response, split by ? and check if the response is Yes/No or a comma separated string of Yes/No or ascending/descending using regex + """ + Description: Parse the answers from the initial response + + Input: response (str) + + Returns: answers (list) + """ + patterns = [ + r"^(yes|no|none)", + r"^(ascending|descending)", + r"(multi-class|binary|multi-label)" + ] + answers = [] - for line in response.lower().split("\n"): + lines = response.lower().split("\n") + + for line in lines: if "?" in line: - response = line.split("?")[1].strip() - if response in ["yes", "no", "none"]: - answers.append(response) - # elif re.match(r"^(Yes|No),\s?(Yes|No)$", response): - # match for Yes/No or ascending/descending and full stop - elif re.match(r"^(yes|no)", response): - answers.append(response) - elif re.match(r"^(ascending|descending)", response): - answers.append(response) - elif re.match(r"(multi-class|binary|multi-label)", response): - answers.append(response) - # if any of the words are in the line, append the line to the answers - elif any(word in line for word in ["yes", "no", "none", "ascending", "descending", "multi-class", "binary", "multi-label"]): - answers.append(line.strip()) + # Extract the part of the line after the question mark + potential_answer = line.split("?")[1].strip() + else: + potential_answer = line.strip() + + # Check if the potential answer matches any of the patterns + for pattern in patterns: + if re.match(pattern, potential_answer): + answers.append(potential_answer) + break # Stop checking other patterns if a match is found + return answers - chain = create_chain(prompt) app = FastAPI() @@ -58,6 +72,5 @@ def parse_answers_initial(response): async def get_llm_query(query: str): query = query.replace("%20", " ") response = chain.invoke({"query": query}) - print(response) answers = parse_answers_initial(response) return JSONResponse(content={"answers": answers}) diff --git a/site/404.html b/site/404.html index a543db0..dadbfa9 100644 --- a/site/404.html +++ b/site/404.html @@ -269,6 +269,26 @@ +
[site/404.html hunk: auto-generated navigation markup for the built documentation site; the added list items include an entry for the new "LLM Query parsing" page. The raw HTML is not reproduced here.]
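For context on the frontend refactor above: the new `run_streamlit` in `frontend/ui_utils.py` pulls its inputs from Streamlit session state instead of taking arguments, so the caller has to populate those keys first. Below is a minimal sketch of the setup `frontend/ui.py` is assumed to perform; only the session-state key names come from the patch, while the widget labels and CSV paths are illustrative placeholders, not taken from the repository.

```python
# Hypothetical caller setup for run_streamlit(). Only the key names
# ("query_type", "query", "data_metadata", "flow_metadata") are read by
# ui_utils.py; labels and file paths below are placeholders.
import pandas as pd
import streamlit as st

from ui_utils import run_streamlit

query_type = st.selectbox("Search for a", ["Dataset", "Flow"])  # illustrative label
query = st.text_input("Enter your query")

st.session_state["query_type"] = query_type
st.session_state["query"] = query
st.session_state["data_metadata"] = pd.read_csv("data/dataset_metadata.csv")  # placeholder path
st.session_state["flow_metadata"] = pd.read_csv("data/flow_metadata.csv")     # placeholder path

run_streamlit()
```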
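The other substantive change is the rewrite of `parse_answers_initial` in `llm_service/llm_service.py` around an explicit pattern list. The following self-contained sketch mirrors that parser and shows the behaviour it is expected to have; the sample response text is invented for illustration, since the real questions come from the prompt template in the backend config.

```python
# Mirrors the pattern-based parser added in llm_service.py; the sample
# response below is hypothetical.
import re


def parse_answers_initial(response: str) -> list[str]:
    patterns = [
        r"^(yes|no|none)",
        r"^(ascending|descending)",
        r"(multi-class|binary|multi-label)",
    ]
    answers = []
    for line in response.lower().split("\n"):
        # Keep only the text after the "?" when the model echoes the question.
        potential_answer = line.split("?")[1].strip() if "?" in line else line.strip()
        for pattern in patterns:
            if re.match(pattern, potential_answer):
                answers.append(potential_answer)
                break  # stop at the first matching pattern
    return answers


sample = (
    "Should dataset size be shown? yes, ascending\n"
    "Should missing values be shown? no\n"
    "What kind of classification is this? binary"
)
print(parse_answers_initial(sample))
# ['yes, ascending', 'no', 'binary'] -- the list later unpacked by
# parse_llm_response() in frontend/ui_utils.py
```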