From b920b24ed5cc61dae69c93d799c806f0dc76de8d Mon Sep 17 00:00:00 2001 From: alisalim17 Date: Sat, 13 Apr 2024 18:03:29 +0400 Subject: [PATCH] refactor: superrag service --- .../api/api_datasource_superrag_manager.py | 4 +- .../api/workflow_configs/data_transformer.py | 43 ++------- .../app/api/workflow_configs/exceptions.py | 6 ++ .../api/workflow_configs/workflow_configs.py | 2 +- libs/superagent/services/superrag.py | 88 ++++++++++++++++--- 5 files changed, 91 insertions(+), 52 deletions(-) create mode 100644 libs/superagent/app/api/workflow_configs/exceptions.py diff --git a/libs/superagent/app/api/workflow_configs/api/api_datasource_superrag_manager.py b/libs/superagent/app/api/workflow_configs/api/api_datasource_superrag_manager.py index 9c416756f..4f50951e0 100644 --- a/libs/superagent/app/api/workflow_configs/api/api_datasource_superrag_manager.py +++ b/libs/superagent/app/api/workflow_configs/api/api_datasource_superrag_manager.py @@ -143,7 +143,7 @@ async def add_datasource(self, assistant: dict, data: dict): data["index_name"] = await self._get_unique_index_name(data, assistant) await self._add_superrag_tool(assistant, data) - await self.superrag_service.aingest(data=data) + self.superrag_service.ingest(data=data) async def delete_datasource(self, assistant: dict, datasource: dict): tool = await self.agent_manager.get_tool( @@ -156,7 +156,7 @@ async def delete_datasource(self, assistant: dict, datasource: dict): tool_metadata = json.loads(tool.metadata) await self._delete_tool(assistant, datasource) - await self.superrag_service.adelete( + self.superrag_service.delete( { **datasource, "index_name": tool_metadata.get("index_name"), diff --git a/libs/superagent/app/api/workflow_configs/data_transformer.py b/libs/superagent/app/api/workflow_configs/data_transformer.py index be18cfc54..30158d17f 100644 --- a/libs/superagent/app/api/workflow_configs/data_transformer.py +++ b/libs/superagent/app/api/workflow_configs/data_transformer.py @@ -2,10 +2,10 @@ import logging from app.api.workflow_configs.api.api_manager import ApiManager +from app.api.workflow_configs.exceptions import MissingVectorDatabaseProvider from app.api.workflow_configs.saml_schema import SAML_OSS_LLM_PROVIDERS from app.utils.helpers import ( get_first_non_null_key, - get_mimetype_from_url, get_superrag_compatible_credentials, remove_key_if_present, rename_and_remove_keys, @@ -13,6 +13,7 @@ from app.utils.llm import LLM_REVERSE_MAPPING, get_llm_provider from app.vectorstores.base import REVERSE_VECTOR_DB_MAPPING from prisma.enums import AgentType, ToolType +from services.superrag import File logger = logging.getLogger(__name__) @@ -24,24 +25,6 @@ } -SUPERRAG_MIME_TYPE_TO_EXTENSION = { - "text/plain": "TXT", - "text/markdown": "MARKDOWN", - "application/pdf": "PDF", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "DOCX", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "XLSX", - "text/csv": "CSV", -} - - -class MissingVectorDatabaseProvider(Exception): - pass - - -class UnkownFileType(Exception): - pass - - # Source https://stackoverflow.com/questions/33797126/proper-way-to-remove-keys-in-dictionary-with-none-values-in-python def delete_none_values(_dict): """Delete None values recursively from all of the dictionaries""" @@ -198,7 +181,7 @@ async def _set_database_provider(self, datasource: dict): } else: raise MissingVectorDatabaseProvider( - "Vector database provider not found." + "Vector database provider not found. " "Please configure it by going to the integrations page" ) remove_key_if_present(datasource, "database_provider") @@ -208,27 +191,13 @@ async def _set_superrag_files(self, datasource: dict): files = [] for url in urls: - file_type = self._get_file_type(url) + file = File(url=url) files.append( { - "type": file_type, - "url": url, + "type": file.type.value, + "url": file.url, } ) datasource["files"] = files remove_key_if_present(datasource, "urls") - - def _get_file_type(self, url: str): - try: - file_type = SUPERRAG_MIME_TYPE_TO_EXTENSION[get_mimetype_from_url(url)] - except KeyError: - supported_file_types = ", ".join( - value for value in SUPERRAG_MIME_TYPE_TO_EXTENSION.values() - ) - raise UnkownFileType( - f"Unknown file type for URL {url}" - f"Supported file types are: {supported_file_types}" - ) - - return file_type diff --git a/libs/superagent/app/api/workflow_configs/exceptions.py b/libs/superagent/app/api/workflow_configs/exceptions.py new file mode 100644 index 000000000..cd918b5cb --- /dev/null +++ b/libs/superagent/app/api/workflow_configs/exceptions.py @@ -0,0 +1,6 @@ +class MissingVectorDatabaseProvider(Exception): + pass + + +class UnkownFileType(Exception): + pass diff --git a/libs/superagent/app/api/workflow_configs/workflow_configs.py b/libs/superagent/app/api/workflow_configs/workflow_configs.py index c2ef03d3d..5582fb425 100644 --- a/libs/superagent/app/api/workflow_configs/workflow_configs.py +++ b/libs/superagent/app/api/workflow_configs/workflow_configs.py @@ -10,7 +10,7 @@ from app.api.workflow_configs.api.api_agent_manager import ApiAgentManager from app.api.workflow_configs.api.api_manager import ApiManager -from app.api.workflow_configs.data_transformer import ( +from app.api.workflow_configs.exceptions import ( MissingVectorDatabaseProvider, UnkownFileType, ) diff --git a/libs/superagent/services/superrag.py b/libs/superagent/services/superrag.py index f2e0b8941..338b5a50f 100644 --- a/libs/superagent/services/superrag.py +++ b/libs/superagent/services/superrag.py @@ -1,8 +1,75 @@ +from enum import Enum from typing import Optional +from urllib.parse import unquote, urlparse -import aiohttp import requests from decouple import config +from pydantic import BaseModel + +from app.api.workflow_configs.exceptions import UnkownFileType + + +# Source https://github.com/superagent-ai/super-rag/blob/bb1630be981b075d8be0585adfc2e697105f510f/models/file.py +class FileType(Enum): + pdf = "PDF" + docx = "DOCX" + txt = "TXT" + pptx = "PPTX" + md = "MARKDOWN" + csv = "CSV" + xlsx = "XLSX" + html = "HTML" + json = "JSON" + eml = "EML" + msg = "MSG" + + def suffix(self) -> str: + suffixes = { + "TXT": ".txt", + "PDF": ".pdf", + "MARKDOWN": ".md", + "DOCX": ".docx", + "CSV": ".csv", + "XLSX": ".xlsx", + "PPTX": ".pptx", + "HTML": ".html", + "JSON": ".json", + "EML": ".eml", + "MSG": ".msg", + } + return suffixes[self.value] + + +class File(BaseModel): + url: str + name: str | None = None + + @property + def type(self) -> FileType | None: + url = self.url + if url: + parsed_url = urlparse(url) + path = unquote(parsed_url.path) + extension = path.split(".")[-1].lower() + try: + return FileType[extension] + except KeyError: + supported_file_types = ", ".join( + [file_type.value for file_type in FileType] + ) + raise UnkownFileType( + f"Unknown file type for URL {url}. " + f"Supported file types are: {supported_file_types}" + ) + return None + + @property + def suffix(self) -> str: + file_type = self.type + if file_type is not None: + return file_type.suffix() + else: + raise ValueError("File type is undefined, cannot determine suffix.") class SuperRagService: @@ -12,25 +79,22 @@ def __init__(self, url: Optional[str] = None): if not self.url: raise ValueError("SUPERRAG_API_URL is not set") - async def _arequest(self, method, endpoint, data): - async with aiohttp.ClientSession() as session: - async with session.request( - method, f"{self.url}/{endpoint}", json=data - ) as response: - return await response.json() - def _request(self, method, endpoint, data): return requests.request(method, f"{self.url}/{endpoint}", json=data).json() - async def aingest(self, data): - return await self._arequest( + def ingest(self, data): + return self._request( "POST", "ingest", data, ) - async def adelete(self, data): - return await self._arequest("DELETE", "delete", data) + def delete(self, data): + return self._request( + "DELETE", + "delete", + data, + ) def query(self, data): return self._request(