Skip to content

Commit

Permalink
feat: Encoders (#28)
Browse files Browse the repository at this point in the history
* chore: Poetry + precommit

* chore: Poetry

* feat: Encoders

* chore: Clean notebook outputs

* feat: Added encoders to query

* chore: Added a note to fix delete for Pinecone

* chore: Linting fix

* fix: Fix Pinecone deletion by file_url

* fix: Pinecone delete

* Small tweaks

* Fix linting

---------

Co-authored-by: Ismail Pelaseyed <homanp@gmail.com>
  • Loading branch information
simjak and homanp committed Feb 11, 2024
1 parent 3ded918 commit 7c9b3b2
Show file tree
Hide file tree
Showing 21 changed files with 710 additions and 59 deletions.
7 changes: 6 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
API_BASE_URL=https://rag.superagent.sh
COHERE_API_KEY=
OPENAI_API_KEY=

# Optional for walkthrough
PINECONE_API_KEY=
PINECONE_HOST=
PINECONE_INDEX=
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.venv
.env
__pycache__/
```
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SuperRag

Super-performant RAG pipeline for AI Agents/Assistants.
Super-performant RAG pipeline for AI Agents/Assistants.

## API

Expand All @@ -23,6 +23,7 @@ Input example:
}
},
"index_name": "my_index",
"encoder": "my_encoder",
"webhook_url": "https://my-webhook-url"
}
```
Expand All @@ -41,6 +42,7 @@ Input example:
}
},
"index_name": "my_index",
"encoder": "my_encoder"
}
```

Expand Down
14 changes: 11 additions & 3 deletions api/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from fastapi import APIRouter

from models.ingest import RequestPayload
from service.embedding import EmbeddingService
from service.embedding import EmbeddingService, get_encoder

router = APIRouter()

Expand All @@ -18,6 +18,10 @@ async def ingest(payload: RequestPayload) -> Dict:
vector_credentials=payload.vector_database,
)
documents = await embedding_service.generate_documents()
chunks = await embedding_service.generate_chunks(documents=documents)

encoder = get_encoder(encoder_type=payload.encoder)

summary_documents = await embedding_service.generate_summary_documents(
documents=documents
)
Expand All @@ -27,9 +31,13 @@ async def ingest(payload: RequestPayload) -> Dict:
)

await asyncio.gather(
embedding_service.generate_embeddings(nodes=chunks),
embedding_service.generate_embeddings(
nodes=summary_chunks, index_name=f"{payload.index_name}summary"
nodes=chunks, encoder=encoder, index_name=payload.index_name
),
embedding_service.generate_embeddings(
nodes=summary_chunks,
encoder=encoder,
index_name=f"{payload.index_name}-summary",
),
)

Expand Down
132 changes: 132 additions & 0 deletions dev/walkthrough.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"\n",
"API_URL = os.environ.get('API_BASE_URL', 'http://localhost:8000')\n",
"PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '')\n",
"PINECONE_INDEX = os.environ.get('PINECONE_INDEX', '')\n",
"PINECONE_HOST = os.environ.get('PINECONE_HOST', '')\n",
"\n",
"print(\"API_URL:\", API_URL)\n",
"print(\"PINECONE_API_KEY:\", PINECONE_API_KEY)\n",
"print(\"PINECONE_INDEX:\", PINECONE_INDEX)\n",
"print(\"PINECONE_HOST:\", PINECONE_HOST)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ingest a file\n",
"url = f\"{API_URL}/api/v1/ingest\"\n",
"\n",
"payload = {\n",
" \"files\": [\n",
" {\n",
" \"type\": \"PDF\",\n",
" \"url\": \"https://arxiv.org/pdf/2402.05131.pdf\"\n",
" }\n",
" ],\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
" \"api_key\": PINECONE_API_KEY,\n",
" \"host\": PINECONE_HOST,\n",
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"openai\",\n",
"}\n",
"\n",
"response = requests.post(url, json=payload)\n",
"\n",
"print(response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Query the index\n",
"query_url = f\"{API_URL}/api/v1/query\"\n",
"\n",
"query_payload = {\n",
" \"input\": \"What is the best chunk strategy?\",\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
" \"api_key\": PINECONE_API_KEY,\n",
" \"host\": PINECONE_HOST,\n",
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"openai\",\n",
"}\n",
"\n",
"query_response = requests.post(query_url, json=query_payload)\n",
"\n",
"print(query_response.json())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Delete the index\n",
"query_url = f\"{API_URL}/api/v1/delete\"\n",
"\n",
"delete_payload = {\n",
" \"file_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
" \"api_key\": PINECONE_API_KEY,\n",
" \"host\": PINECONE_HOST,\n",
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
"}\n",
"\n",
"delete_response = requests.delete(query_url, json=delete_payload)\n",
"\n",
"print(delete_response.json())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
15 changes: 15 additions & 0 deletions encoders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from encoders.base import BaseEncoder
from encoders.bm25 import BM25Encoder
from encoders.cohere import CohereEncoder
from encoders.fastembed import FastEmbedEncoder
from encoders.huggingface import HuggingFaceEncoder
from encoders.openai import OpenAIEncoder

__all__ = [
"BaseEncoder",
"CohereEncoder",
"OpenAIEncoder",
"BM25Encoder",
"FastEmbedEncoder",
"HuggingFaceEncoder",
]
16 changes: 16 additions & 0 deletions encoders/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import List

from pydantic.v1 import BaseModel, Field


class BaseEncoder(BaseModel):
    """Abstract base class for embedding encoders.

    Subclasses (e.g. OpenAI, Cohere, BM25) implement ``__call__`` to map a
    batch of text documents to embedding vectors.
    """

    # Model identifier used by the concrete encoder (e.g. "embed-english-v3.0").
    name: str
    # Minimum similarity score for a result to count as a match with this encoder.
    score_threshold: float
    type: str = Field(default="base")
    # Embedding vector length; 1536 presumably matches OpenAI ada-002 — subclasses may override.
    dimension: int = Field(default=1536)

    class Config:
        # Allow non-pydantic field types (e.g. third-party API client objects) on subclasses.
        arbitrary_types_allowed = True

    def __call__(self, docs: List[str]) -> List[List[float]]:
        """Encode *docs* into embedding vectors; must be implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement this method")
67 changes: 67 additions & 0 deletions encoders/bm25.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from typing import Any, Dict, List, Optional

from semantic_router.encoders import BaseEncoder
from semantic_router.utils.logger import logger


class BM25Encoder(BaseEncoder):
    """Sparse BM25 encoder backed by ``pinecone_text``'s BM25 implementation.

    Emits each sparse BM25 vector as a dense Python list with one slot per
    token index known to the fitted model (positions come from ``idx_mapping``).
    """

    model: Optional[Any] = None  # pinecone_text BM25Encoder instance
    idx_mapping: Optional[Dict[int, int]] = None  # token index -> output vector position
    type: str = "sparse"

    def __init__(
        self,
        name: str = "bm25",
        score_threshold: float = 0.82,
        use_default_params: bool = True,
    ):
        super().__init__(name=name, score_threshold=score_threshold)
        try:
            from pinecone_text.sparse import BM25Encoder as encoder
        except ImportError:
            raise ImportError(
                "Please install pinecone-text to use BM25Encoder. "
                "You can install it with: `pip install 'semantic-router[hybrid]'`"
            )

        self.model = encoder()

        if use_default_params:
            # Fixed typo: "sBM25" -> "BM25".
            logger.info("Downloading and initializing default BM25 model parameters.")
            self.model = encoder.default()
            self._set_idx_mapping()

    def _set_idx_mapping(self):
        """Build the token-index -> vector-position mapping from the model params.

        Raises:
            TypeError: if the model's ``doc_freq`` parameter is not a dict.
        """
        params = self.model.get_params()
        doc_freq = params["doc_freq"]
        if isinstance(doc_freq, dict):
            indices = doc_freq["indices"]
            self.idx_mapping = {int(idx): i for i, idx in enumerate(indices)}
        else:
            raise TypeError("Expected a dictionary for 'doc_freq'")

    def __call__(self, docs: List[str]) -> List[List[float]]:
        """Encode *docs*: a single doc is encoded as a query, multiple as documents.

        Raises:
            ValueError: if the model/mapping is uninitialized or *docs* is empty.
        """
        if self.model is None or self.idx_mapping is None:
            raise ValueError("Model or index mapping is not initialized.")
        if len(docs) == 1:
            sparse_dicts = self.model.encode_queries(docs)
        elif len(docs) > 1:
            sparse_dicts = self.model.encode_documents(docs)
        else:
            raise ValueError("No documents to encode.")

        # BUG FIX: the previous `[[0.0] * n] * len(docs)` replicated a single
        # inner list, so every row aliased the same object and all documents
        # ended up with the values written for the last one. Build one
        # independent zero-vector per document instead.
        embeds = [[0.0] * len(self.idx_mapping) for _ in docs]
        for i, output in enumerate(sparse_dicts):
            indices = output["indices"]
            values = output["values"]
            for idx, val in zip(indices, values):
                # Skip token indices the default/fitted vocabulary doesn't know.
                if idx in self.idx_mapping:
                    position = self.idx_mapping[idx]
                    embeds[i][position] = val
        return embeds

    def fit(self, docs: List[str]):
        """Fit the underlying BM25 model on *docs* and refresh the index mapping."""
        if self.model is None:
            raise ValueError("Model is not initialized.")
        self.model.fit(docs)
        self._set_idx_mapping()
39 changes: 39 additions & 0 deletions encoders/cohere.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import List, Optional

import cohere
from decouple import config

from encoders import BaseEncoder


class CohereEncoder(BaseEncoder):
    """Dense encoder that embeds documents through the Cohere embeddings API."""

    client: Optional[cohere.Client] = None
    type: str = "cohere"

    def __init__(
        self,
        name: Optional[str] = None,
        cohere_api_key: Optional[str] = None,
        score_threshold: float = 0.3,
    ):
        # Resolve the model name: explicit argument wins, else the configured default.
        model_name = name if name is not None else config(
            "COHERE_MODEL_NAME", "embed-english-v3.0"
        )
        super().__init__(name=model_name, score_threshold=score_threshold)

        # Resolve the API key the same way: argument first, then environment.
        api_key = cohere_api_key or config("COHERE_API_KEY")
        if api_key is None:
            raise ValueError("Cohere API key cannot be 'None'.")
        try:
            self.client = cohere.Client(api_key)
        except Exception as e:
            raise ValueError(
                f"Cohere API client failed to initialize. Error: {e}"
            ) from e

    def __call__(self, docs: List[str]) -> List[List[float]]:
        """Embed *docs* with the configured Cohere model and return the vectors."""
        if self.client is None:
            raise ValueError("Cohere client is not initialized.")
        try:
            response = self.client.embed(
                docs, input_type="search_query", model=self.name
            )
            return response.embeddings
        except Exception as e:
            raise ValueError(f"Cohere API call failed. Error: {e}") from e
Loading

0 comments on commit 7c9b3b2

Please sign in to comment.