Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor: SuperRag #960

Merged
merged 1 commit into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ async def add_datasource(self, assistant: dict, data: dict):
data["index_name"] = await self._get_unique_index_name(data, assistant)

await self._add_superrag_tool(assistant, data)
await self.superrag_service.aingest(data=data)
self.superrag_service.ingest(data=data)

async def delete_datasource(self, assistant: dict, datasource: dict):
tool = await self.agent_manager.get_tool(
Expand All @@ -156,7 +156,7 @@ async def delete_datasource(self, assistant: dict, datasource: dict):
tool_metadata = json.loads(tool.metadata)

await self._delete_tool(assistant, datasource)
await self.superrag_service.adelete(
self.superrag_service.delete(
{
**datasource,
"index_name": tool_metadata.get("index_name"),
Expand Down
43 changes: 6 additions & 37 deletions libs/superagent/app/api/workflow_configs/data_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@
import logging

from app.api.workflow_configs.api.api_manager import ApiManager
from app.api.workflow_configs.exceptions import MissingVectorDatabaseProvider
from app.api.workflow_configs.saml_schema import SAML_OSS_LLM_PROVIDERS
from app.utils.helpers import (
get_first_non_null_key,
get_mimetype_from_url,
get_superrag_compatible_credentials,
remove_key_if_present,
rename_and_remove_keys,
)
from app.utils.llm import LLM_REVERSE_MAPPING, get_llm_provider
from app.vectorstores.base import REVERSE_VECTOR_DB_MAPPING
from prisma.enums import AgentType, ToolType
from services.superrag import File

logger = logging.getLogger(__name__)

Expand All @@ -24,24 +25,6 @@
}


SUPERRAG_MIME_TYPE_TO_EXTENSION = {
"text/plain": "TXT",
"text/markdown": "MARKDOWN",
"application/pdf": "PDF",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "DOCX",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "XLSX",
"text/csv": "CSV",
}


class MissingVectorDatabaseProvider(Exception):
pass


class UnkownFileType(Exception):
pass


# Source https://stackoverflow.com/questions/33797126/proper-way-to-remove-keys-in-dictionary-with-none-values-in-python
def delete_none_values(_dict):
"""Delete None values recursively from all of the dictionaries"""
Expand Down Expand Up @@ -198,7 +181,7 @@ async def _set_database_provider(self, datasource: dict):
}
else:
raise MissingVectorDatabaseProvider(
"Vector database provider not found."
"Vector database provider not found. "
"Please configure it by going to the integrations page"
)
remove_key_if_present(datasource, "database_provider")
Expand All @@ -208,27 +191,13 @@ async def _set_superrag_files(self, datasource: dict):
files = []

for url in urls:
file_type = self._get_file_type(url)
file = File(url=url)
files.append(
{
"type": file_type,
"url": url,
"type": file.type.value,
"url": file.url,
}
)

datasource["files"] = files
remove_key_if_present(datasource, "urls")

def _get_file_type(self, url: str):
try:
file_type = SUPERRAG_MIME_TYPE_TO_EXTENSION[get_mimetype_from_url(url)]
except KeyError:
supported_file_types = ", ".join(
value for value in SUPERRAG_MIME_TYPE_TO_EXTENSION.values()
)
raise UnkownFileType(
f"Unknown file type for URL {url}"
f"Supported file types are: {supported_file_types}"
)

return file_type
6 changes: 6 additions & 0 deletions libs/superagent/app/api/workflow_configs/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class MissingVectorDatabaseProvider(Exception):
pass


class UnkownFileType(Exception):
pass
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from app.api.workflow_configs.api.api_agent_manager import ApiAgentManager
from app.api.workflow_configs.api.api_manager import ApiManager
from app.api.workflow_configs.data_transformer import (
from app.api.workflow_configs.exceptions import (
MissingVectorDatabaseProvider,
UnkownFileType,
)
Expand Down
88 changes: 76 additions & 12 deletions libs/superagent/services/superrag.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,75 @@
from enum import Enum
from typing import Optional
from urllib.parse import unquote, urlparse

import aiohttp
import requests
from decouple import config
from pydantic import BaseModel

from app.api.workflow_configs.exceptions import UnkownFileType


# Source https://github.com/superagent-ai/super-rag/blob/bb1630be981b075d8be0585adfc2e697105f510f/models/file.py
class FileType(Enum):
pdf = "PDF"
docx = "DOCX"
txt = "TXT"
pptx = "PPTX"
md = "MARKDOWN"
csv = "CSV"
xlsx = "XLSX"
html = "HTML"
json = "JSON"
eml = "EML"
msg = "MSG"

def suffix(self) -> str:
suffixes = {
"TXT": ".txt",
"PDF": ".pdf",
"MARKDOWN": ".md",
"DOCX": ".docx",
"CSV": ".csv",
"XLSX": ".xlsx",
"PPTX": ".pptx",
"HTML": ".html",
"JSON": ".json",
"EML": ".eml",
"MSG": ".msg",
}
return suffixes[self.value]


class File(BaseModel):
url: str
name: str | None = None

@property
def type(self) -> FileType | None:
url = self.url
if url:
parsed_url = urlparse(url)
path = unquote(parsed_url.path)
extension = path.split(".")[-1].lower()
try:
return FileType[extension]
except KeyError:
supported_file_types = ", ".join(
[file_type.value for file_type in FileType]
)
raise UnkownFileType(
f"Unknown file type for URL {url}. "
f"Supported file types are: {supported_file_types}"
)
return None

@property
def suffix(self) -> str:
file_type = self.type
if file_type is not None:
return file_type.suffix()
else:
raise ValueError("File type is undefined, cannot determine suffix.")


class SuperRagService:
Expand All @@ -12,25 +79,22 @@ def __init__(self, url: Optional[str] = None):
if not self.url:
raise ValueError("SUPERRAG_API_URL is not set")

async def _arequest(self, method, endpoint, data):
async with aiohttp.ClientSession() as session:
async with session.request(
method, f"{self.url}/{endpoint}", json=data
) as response:
return await response.json()

def _request(self, method, endpoint, data):
return requests.request(method, f"{self.url}/{endpoint}", json=data).json()

async def aingest(self, data):
return await self._arequest(
def ingest(self, data):
return self._request(
"POST",
"ingest",
data,
)

async def adelete(self, data):
return await self._arequest("DELETE", "delete", data)
def delete(self, data):
return self._request(
"DELETE",
"delete",
data,
)

def query(self, data):
return self._request(
Expand Down