From 8e89e52a9a1931b1a5172c3961b14838be0c1d7a Mon Sep 17 00:00:00 2001
From: SubhadityaMukherjee
Date: Mon, 8 Jul 2024 16:56:36 +0200
Subject: [PATCH] fixed some bugs here and there

---
 docs/inference.md                                  |    4 +-
 docs/query_llm.md                                  |    4 +
 frontend/ui.py                                     |   77 +-
 frontend/ui_utils.py                               |  184 ++
 frontend/utils.py                                  |   60 -
 llm_service/llm_service.py                         |   57 +-
 site/404.html                                      |   36 +-
 site/configuration/index.html                      |   36 +-
 .../change model/index.html                        |   36 +-
 .../create vectordb/index.html                     |   36 +-
 .../get an llm summary/index.html                  |   36 +-
 site/developer tutorials/index.html                |   36 +-
 .../load vectordb and get results/index.html       |   36 +-
 .../index.html                                     |  385 +---
 .../train and evaluate models/index.html           |  622 +----
 site/docker/index.html                             |   36 +-
 site/index.html                                    |   36 +-
 site/inference/index.html                          |  772 ++++++-
 site/modules/general_utils/index.html              |   60 +-
 site/modules/llm_module/index.html                 |  220 +-
 site/modules/metadata_module/index.html            | 2002 ++++++++++++++-------
 site/modules/result_gen/index.html                 |  214 +-
 site/objects.inv                                   |  Bin 689 -> 892 bytes
 site/query_llm/index.html                          | 1001 +++++++++
 site/search/search_index.json                      |    2 +-
 site/sitemap.xml.gz                                |  Bin 127 -> 127 bytes
 site/testing/index.html                            |  150 +-
 site/training/index.html                           |   36 +-
 28 files changed, 3935 insertions(+), 2239 deletions(-)
 create mode 100644 docs/query_llm.md
 create mode 100644 frontend/ui_utils.py
 delete mode 100644 frontend/utils.py
 create mode 100644 site/query_llm/index.html

diff --git a/docs/inference.md b/docs/inference.md
index 50e0a71..85ccb65 100644
--- a/docs/inference.md
+++ b/docs/inference.md
@@ -9,4 +9,6 @@
 
 ## Errors
 
-- If you get an error about file permissions, run `chmod +x start_local.sh` and `chmod +x stop_local.sh` to make them executable.
\ No newline at end of file
+- If you get an error about file permissions, run `chmod +x start_local.sh` and `chmod +x stop_local.sh` to make them executable.
+ +:::ui_utils \ No newline at end of file diff --git a/docs/query_llm.md b/docs/query_llm.md new file mode 100644 index 0000000..b3c3bd1 --- /dev/null +++ b/docs/query_llm.md @@ -0,0 +1,4 @@ +# LLM Query parsing + +- The LLM reads the query and parses it into a list of filters based on a prompt +:::llm_service diff --git a/frontend/ui.py b/frontend/ui.py index 2d585d4..65917ab 100644 --- a/frontend/ui.py +++ b/frontend/ui.py @@ -2,11 +2,9 @@ from pathlib import Path import pandas as pd -import requests import streamlit as st -from streamlit_feedback import streamlit_feedback -from utils import (feedback_cb, filter_initial_response, parse_llm_response, - update_subset_cols) +from ui_utils import run_streamlit + with open("../backend/config.json", "r") as file: config = json.load(file) @@ -19,6 +17,7 @@ data_metadata = pd.read_csv(data_metadata) flow_metadata = pd.read_csv(flow_metadata) + # Main Streamlit App st.title("OpenML AI Search") @@ -28,72 +27,4 @@ st.session_state["query"] = query response = {"initial_response": None} - -if st.button("Submit"): - with st.spinner("Waiting for results..."): - try: - response = requests.get( - f"http://fastapi:8000/{query_type.lower()}/{query}", - json={"query": query, "type": query_type.lower()}, - ).json() - except: - response = requests.get( - f"http://0.0.0.0:8000/{query_type.lower()}/{query}", - json={"query": query, "type": query_type.lower()}, - ).json() - - if response["initial_response"] is not None: - st.write("Results:") - - # response is the ids, we need to get the metdata from the ids - if query_type == "Dataset": - initial_response = data_metadata[ - data_metadata["did"].isin(response["initial_response"]) - ] - # subset_cols = ["did", "name","OpenML URL","Description", "command"] - else: - initial_response = flow_metadata[ - flow_metadata["id"].isin(response["initial_response"]) - ] - - # def process query using results from port 8001/llmquery/{query} - with st.spinner("Using an LLM to find the most relevent information..."): - if query_type == "Dataset": - try: - llm_response = requests.get( - f"http://fastapi:8081/llmquery/{query}" - ).json() - except: - llm_response = requests.get( - f"http://0.0.0.0:8081/llmquery/{query}" - ).json() - - subset_cols = ["did", "name"] - try: - ( - dataset_size, - dataset_missing, - dataset_classification, - dataset_sort, - ) = parse_llm_response(llm_response) - subset_cols = update_subset_cols( - dataset_size, dataset_missing, dataset_classification - ) - initial_response = filter_initial_response( - initial_response, dataset_classification - ) - except Exception as e: - st.error(f"Error processing LLM response: {e}") - - initial_response = initial_response[subset_cols] - - st.dataframe(initial_response) - - with st.form("fb_form"): - streamlit_feedback( - feedback_type="thumbs", - align="flex-start", - key="fb_k", - optional_text_label="[Optional] Please provide an explanation", - ) - st.form_submit_button("Save feedback", on_click=feedback_cb) +run_streamlit() \ No newline at end of file diff --git a/frontend/ui_utils.py b/frontend/ui_utils.py new file mode 100644 index 0000000..015666e --- /dev/null +++ b/frontend/ui_utils.py @@ -0,0 +1,184 @@ +import json +import os + +from streamlit import session_state as ss +from streamlit_feedback import streamlit_feedback +import requests +import streamlit as st + +def feedback_cb(): + """ + Description: Callback function to save feedback to a file + + Input: None + + Returns: None + """ + file_path = "feedback.json" + + if os.path.exists(file_path): + with 
open(file_path, "r") as file: + try: + data = json.load(file) + except json.JSONDecodeError: + data = [] + else: + data = [] + + # Append new feedback + data.append({"ss": ss.fb_k, "query": ss.query}) + + # Write updated content back to the file + with open(file_path, "w") as file: + json.dump(data, file, indent=4) + + +def parse_llm_response(response): + """ + Description: Parse the answers from the LLM response + + Input: response (dict) + + Returns: size (str), missing (str), classification (str), sort (str) + """ + size, missing, classification = response["answers"] + size, sort = size.split(",") if "," in size else (size, None) + return size, missing, classification, sort + + +def update_subset_cols(size, missing, classification): + """ + Description: Update the subset columns based on LLM's response + + Input: size (str), missing (str), classification (str) + + Returns: cols (list) + """ + cols = ["did", "name"] + if size == "yes": + cols.append("NumberOfInstances") + if missing == "yes": + cols.append("NumberOfMissingValues") + if classification != "none": + cols.append("NumberOfClasses") + return cols + + +def filter_initial_response(response, classification): + """ + Description: Filter the initial response based on the classification + + Input: response (DataFrame), classification (str) + + Returns: response (DataFrame) + """ + if classification != "none": + if "multi" in classification: + response = response[response["NumberOfClasses"] > 2] + elif "binary" in classification: + response = response[response["NumberOfClasses"] == 2] + return response + + +def fetch_response(query_type, query): + """ + Description: Fetch the response from the FastAPI service + + Input: query_type (str), query (str) + + Returns: response (dict) + """ + try: + response = requests.get( + f"http://fastapi:8000/{query_type.lower()}/{query}", + json={"query": query, "type": query_type.lower()}, + ).json() + except: + response = requests.get( + f"http://0.0.0.0:8000/{query_type.lower()}/{query}", + json={"query": query, "type": query_type.lower()}, + ).json() + return response + +def fetch_llm_response(query): + """ + Description: Fetch the response from the LLM service + + Input: query (str) + + Returns: llm_response (dict) + """ + try: + llm_response = requests.get(f"http://fastapi:8081/llmquery/{query}").json() + except: + llm_response = requests.get(f"http://0.0.0.0:8081/llmquery/{query}").json() + return llm_response + +def parse_and_update_response(query_type, response, llm_response, data_metadata, flow_metadata): + """ + Description: Parse and update the response based on the query type + + Input: query_type (str), response (dict), llm_response (dict), data_metadata (DataFrame), flow_metadata (DataFrame) + + Returns: initial_response (DataFrame) + """ + if query_type == "Dataset": + initial_response = data_metadata[data_metadata["did"].isin(response["initial_response"])] + subset_cols = ["did", "name"] + try: + dataset_size, dataset_missing, dataset_classification, dataset_sort = parse_llm_response(llm_response) + subset_cols = update_subset_cols(dataset_size, dataset_missing, dataset_classification) + initial_response = filter_initial_response(initial_response, dataset_classification) + except Exception as e: + st.error(f"Error processing LLM response: {e}") + initial_response = initial_response[subset_cols] + else: + initial_response = flow_metadata[flow_metadata["id"].isin(response["initial_response"])] + return initial_response + +def display_results(initial_response): + """ + Description: 
Display the results in a DataFrame + + Input: initial_response (DataFrame) + + Returns: None + """ + st.write("Results:") + st.dataframe(initial_response) + +def run_streamlit(): + """ + Description: Run the Streamlit app + + Input: None + + Returns: None + """ + if st.button("Submit"): + with st.spinner("Waiting for results..."): + query_type = st.session_state['query_type'] + query = st.session_state['query'] + data_metadata = st.session_state['data_metadata'] + flow_metadata = st.session_state['flow_metadata'] + + response = fetch_response(query_type, query) + + if response["initial_response"] is not None: + if query_type == "Dataset": + with st.spinner("Using an LLM to find the most relevant information..."): + llm_response = fetch_llm_response(query) + initial_response = parse_and_update_response(query_type, response, llm_response, data_metadata, flow_metadata) + else: + initial_response = parse_and_update_response(query_type, response, None, data_metadata, flow_metadata) + + display_results(initial_response) + + with st.form("fb_form"): + streamlit_feedback( + feedback_type="thumbs", + align="flex-start", + key="fb_k", + optional_text_label="[Optional] Please provide an explanation", + ) + st.form_submit_button("Save feedback", on_click=feedback_cb) \ No newline at end of file diff --git a/frontend/utils.py b/frontend/utils.py deleted file mode 100644 index a8bdc49..0000000 --- a/frontend/utils.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -import os - -from streamlit import session_state as ss - - -def feedback_cb(): - """ - Description: Callback function to save feedback to a file - - Input: None - - Returns: None - """ - file_path = "feedback.json" - - if os.path.exists(file_path): - with open(file_path, "r") as file: - try: - data = json.load(file) - except json.JSONDecodeError: - data = [] - else: - data = [] - - # Append new feedback - data.append({"ss": ss.fb_k, "query": ss.query}) - - # Write updated content back to the file - with open(file_path, "w") as file: - json.dump(data, file, indent=4) - - -def parse_llm_response(response): - # Define the function to parse the llm_response - size, missing, classification = response["answers"] - size, sort = size.split(",") if "," in size else (size, None) - return size, missing, classification, sort - - -def update_subset_cols(size, missing, classification): - # Define the function to update the subset columns - cols = ["did", "name"] - if size == "yes": - cols.append("NumberOfInstances") - if missing == "yes": - cols.append("NumberOfMissingValues") - if classification != "none": - cols.append("NumberOfClasses") - return cols - - -def filter_initial_response(response, classification): - # Define the function to filter the initial response based on classification - if classification != "none": - if "multi" in classification: - response = response[response["NumberOfClasses"] > 2] - elif "binary" in classification: - response = response[response["NumberOfClasses"] == 2] - return response diff --git a/llm_service/llm_service.py b/llm_service/llm_service.py index 5881eec..48e895a 100644 --- a/llm_service/llm_service.py +++ b/llm_service/llm_service.py @@ -2,7 +2,7 @@ from fastapi import FastAPI from fastapi.responses import JSONResponse from httpx import ConnectTimeout -from langchain_community.chat_models import ChatOllama +from langchain_community.chat_models.ollama import ChatOllama from langchain_core.output_parsers import StrOutputParser from langchain_core.prompts import ChatPromptTemplate from tenacity import retry, 
retry_if_exception_type, stop_after_attempt @@ -16,37 +16,51 @@ def create_chain(prompt, model="llama3", temperature=0): + """ + Description: Create a chain with the given prompt and model + + Input: prompt (str), model (str), temperature (float) + + Returns: chain (Chain) + """ llm = ChatOllama(model=model, temperature=temperature) prompt = ChatPromptTemplate.from_template(prompt) - # using LangChain Expressive Language chain syntax - # learn more about the LCEL on - # /docs/concepts/#langchain-expression-language-lcel return prompt | llm | StrOutputParser() def parse_answers_initial(response): - # for each line in the response, split by ? and check if the response is Yes/No or a comma separated string of Yes/No or ascending/descending using regex + """ + Description: Parse the answers from the initial response + + Input: response (str) + + Returns: answers (list) + """ + patterns = [ + r"^(yes|no|none)", + r"^(ascending|descending)", + r"(multi-class|binary|multi-label)" + ] + answers = [] - for line in response.lower().split("\n"): + lines = response.lower().split("\n") + + for line in lines: if "?" in line: - response = line.split("?")[1].strip() - if response in ["yes", "no", "none"]: - answers.append(response) - # elif re.match(r"^(Yes|No),\s?(Yes|No)$", response): - # match for Yes/No or ascending/descending and full stop - elif re.match(r"^(yes|no)", response): - answers.append(response) - elif re.match(r"^(ascending|descending)", response): - answers.append(response) - elif re.match(r"(multi-class|binary|multi-label)", response): - answers.append(response) - # if any of the words are in the line, append the line to the answers - elif any(word in line for word in ["yes", "no", "none", "ascending", "descending", "multi-class", "binary", "multi-label"]): - answers.append(line.strip()) + # Extract the part of the line after the question mark + potential_answer = line.split("?")[1].strip() + else: + potential_answer = line.strip() + + # Check if the potential answer matches any of the patterns + for pattern in patterns: + if re.match(pattern, potential_answer): + answers.append(potential_answer) + break # Stop checking other patterns if a match is found + return answers - chain = create_chain(prompt) app = FastAPI() @@ -58,6 +72,5 @@ def parse_answers_initial(response): async def get_llm_query(query: str): query = query.replace("%20", " ") response = chain.invoke({"query": query}) - print(response) answers = parse_answers_initial(response) return JSONResponse(content={"answers": answers}) diff --git a/site/404.html b/site/404.html index a543db0..dadbfa9 100644 --- a/site/404.html +++ b/site/404.html @@ -269,6 +269,26 @@ +
[site/404.html hunk: auto-generated navigation markup for the built documentation site; the added list items include an entry for the new "LLM Query parsing" page. The raw HTML is not reproduced here.]
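For context on the frontend refactor above: the new `run_streamlit` in `frontend/ui_utils.py` pulls its inputs from Streamlit session state instead of taking arguments, so the caller has to populate those keys first. Below is a minimal sketch of the setup `frontend/ui.py` is assumed to perform; only the session-state key names come from the patch, while the widget labels and CSV paths are illustrative placeholders, not taken from the repository.

```python
# Hypothetical caller setup for run_streamlit(). Only the key names
# ("query_type", "query", "data_metadata", "flow_metadata") are read by
# ui_utils.py; labels and file paths below are placeholders.
import pandas as pd
import streamlit as st

from ui_utils import run_streamlit

query_type = st.selectbox("Search for a", ["Dataset", "Flow"])  # illustrative label
query = st.text_input("Enter your query")

st.session_state["query_type"] = query_type
st.session_state["query"] = query
st.session_state["data_metadata"] = pd.read_csv("data/dataset_metadata.csv")  # placeholder path
st.session_state["flow_metadata"] = pd.read_csv("data/flow_metadata.csv")     # placeholder path

run_streamlit()
```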
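The other substantive change is the rewrite of `parse_answers_initial` in `llm_service/llm_service.py` around an explicit pattern list. The following self-contained sketch mirrors that parser and shows the behaviour it is expected to have; the sample response text is invented for illustration, since the real questions come from the prompt template in the backend config.

```python
# Mirrors the pattern-based parser added in llm_service.py; the sample
# response below is hypothetical.
import re


def parse_answers_initial(response: str) -> list[str]:
    patterns = [
        r"^(yes|no|none)",
        r"^(ascending|descending)",
        r"(multi-class|binary|multi-label)",
    ]
    answers = []
    for line in response.lower().split("\n"):
        # Keep only the text after the "?" when the model echoes the question.
        potential_answer = line.split("?")[1].strip() if "?" in line else line.strip()
        for pattern in patterns:
            if re.match(pattern, potential_answer):
                answers.append(potential_answer)
                break  # stop at the first matching pattern
    return answers


sample = (
    "Should dataset size be shown? yes, ascending\n"
    "Should missing values be shown? no\n"
    "What kind of classification is this? binary"
)
print(parse_answers_initial(sample))
# ['yes, ascending', 'no', 'binary'] -- the list later unpacked by
# parse_llm_response() in frontend/ui_utils.py
```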