Skip to content

Commit

Permalink
feat: Semantic splitter (#63)
Browse files Browse the repository at this point in the history
* feat: Semantic Splitter + minor improvements

* chore: testing

* fix: BaseDocumentChunk output fix

* feat: Combined chunk title with chunk content

* chore: Updated semantic router version

* chore: Updated semantic router version

* Small tweaks and bug fixes

* Update README

* chore: Merging

* chore: Testing

* chore: Minor improvements

* Add support for querying code interpreter (#66)

* Add support for querying code interpreter

* Fix formatting

* Ensure the sandbox close is called on exceptions

* Update service/code_interpreter.py

Co-authored-by: Tomas Valenta <valenta.and.thomas@gmail.com>

* Update service/code_interpreter.py

Co-authored-by: Tomas Valenta <valenta.and.thomas@gmail.com>

* Update service/router.py

Co-authored-by: Tomas Valenta <valenta.and.thomas@gmail.com>

* Update service/code_interpreter.py

Co-authored-by: Tomas Valenta <valenta.and.thomas@gmail.com>

* Add system prompt

* Format code

* Bump dependencies

* Minor tweaks

---------

Co-authored-by: Tomas Valenta <valenta.and.thomas@gmail.com>

* Minor tweaks

---------

Co-authored-by: Ismail Pelaseyed <homanp@gmail.com>
Co-authored-by: Tomas Valenta <valenta.and.thomas@gmail.com>
  • Loading branch information
3 people authored Mar 2, 2024
1 parent db5c57a commit ae3b113
Show file tree
Hide file tree
Showing 22 changed files with 803 additions and 272 deletions.
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cSpell.words": [
"tiktoken",
"Upserted"
]
}
47 changes: 33 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,23 +66,41 @@ Super-Rag comes with a built in REST API powered by FastApi.
// Payload
{
"files": [{
"url": "https://arxiv.org/pdf/2210.03629.pdf"
}],
"files": [
{
"name": "My file", // Optional
"url": "https://path-to-my-file.pdf"
}
],
"document_processor": { // Optional
"encoder": {
"dimensions": 384,
"model_name": "embed-multilingual-light-v3.0",
"provider": "cohere"
},
"unstructured": {
"hi_res_model_name": "detectron2_onnx",
"partition_strategy": "auto",
"process_tables": false
},
"splitter": {
"max_tokens": 400,
"min_tokens": 30,
"name": "semantic",
"prefix_summary": true,
"prefix_title": true,
"rolling_window_size": 1
}
},
"vector_database": {
"type": "qdrant",
"config": {
"api_key": "YOUR API KEY",
"host": "THE QDRANT HOST"
}
},
"encoder": {
"type": "openai",
"name": "text-embedding-3-small",
"dimensions": 1536 // encoder depends on the provider and model
},
"index_name": "YOUR INDEX",
"webhook_url": "https://webhook.site/0e217d1c-49f1-424a-9992-497db09f7793"
"index_name": "my_index",
"webhook_url": "https://my-webhook-url"
}
```

Expand All @@ -103,12 +121,13 @@ Super-Rag comes with a built in REST API powered by FastApi.
"index_name": "YOUR INDEX",
"interpreter_mode": true,
"encoder": {
"type": "cohere",
"name": "embed-multilingual-light-v3.0",
"provider": "openai",
"name": "text-embedding-3-small",
"dimensions": 384
},
"exclude_fields": ["metadata"],
"session_id": "test"
"exclude_fields": ["metadata"], // Exclude specific fields
"interpreter_mode": false, // Set to true if you wish to run computation Q&A with a code interpreter
"session_id": "my_session_id" // keeps micro-vm sessions and enables caching
}
```
Expand Down
5 changes: 2 additions & 3 deletions api/delete.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from fastapi import APIRouter

from models.delete import RequestPayload, ResponsePayload
from service.embedding import get_encoder
from vectordbs import get_vector_service
from vectordbs.base import BaseVectorDatabase

Expand All @@ -10,12 +9,12 @@

@router.delete("/delete", response_model=ResponsePayload)
async def delete(payload: RequestPayload):
encoder = get_encoder(encoder_config=payload.encoder)
encoder = payload.encoder.get_encoder()
vector_service: BaseVectorDatabase = get_vector_service(
index_name=payload.index_name,
credentials=payload.vector_database,
encoder=encoder,
dimensions=encoder.dimensions,
dimensions=payload.encoder.dimensions,
)

for file in payload.files:
Expand Down
26 changes: 17 additions & 9 deletions api/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from fastapi import APIRouter

from models.ingest import RequestPayload
from service.embedding import EmbeddingService, get_encoder
from service.embedding import EmbeddingService
from service.ingest import handle_google_drive, handle_urls
from utils.summarise import SUMMARY_SUFFIX

Expand All @@ -14,25 +14,33 @@

@router.post("/ingest")
async def ingest(payload: RequestPayload) -> Dict:
encoder = get_encoder(encoder_config=payload.encoder)
encoder = payload.document_processor.encoder.get_encoder()
embedding_service = EmbeddingService(
encoder=encoder,
index_name=payload.index_name,
vector_credentials=payload.vector_database,
dimensions=payload.encoder.dimensions,
dimensions=payload.document_processor.encoder.dimensions,
)
chunks = []
summary_documents = []
if payload.files:
chunks, summary_documents = await handle_urls(embedding_service, payload.files)
chunks, summary_documents = await handle_urls(
embedding_service=embedding_service,
files=payload.files,
config=payload.document_processor,
)

elif payload.google_drive:
chunks, summary_documents = await handle_google_drive(
embedding_service, payload.google_drive
)
) # type: ignore TODO: Fix typing

await asyncio.gather(
embedding_service.generate_and_upsert_embeddings(
documents=chunks, encoder=encoder, index_name=payload.index_name
embedding_service.embed_and_upsert(
chunks=chunks, encoder=encoder, index_name=payload.index_name
),
embedding_service.generate_and_upsert_embeddings(
documents=summary_documents,
embedding_service.embed_and_upsert(
chunks=summary_documents,
encoder=encoder,
index_name=f"{payload.index_name}{SUMMARY_SUFFIX}",
),
Expand Down
10 changes: 7 additions & 3 deletions api/query.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from fastapi import APIRouter

from models.query import RequestPayload, ResponseData, ResponsePayload
from models.query import RequestPayload, ResponsePayload
from service.router import query as _query

router = APIRouter()
Expand All @@ -9,5 +9,9 @@
@router.post("/query", response_model=ResponsePayload)
async def query(payload: RequestPayload):
chunks = await _query(payload=payload)
response_data = [ResponseData(**chunk.model_dump()) for chunk in chunks]
return {"success": True, "data": response_data}
# NOTE: Filter out fields before given to LLM
response_payload = ResponsePayload(success=True, data=chunks)
response_data = response_payload.model_dump(
exclude=set(payload.exclude_fields) if payload.exclude_fields else None
)
return response_data
2 changes: 1 addition & 1 deletion dev/embedding.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
"metadata": {},
"outputs": [],
"source": [
"elements = await embedding_service._download_and_extract_elements(file, strategy=\"auto\")\n"
"elements = await embedding_service._partition_file(file, strategy=\"auto\")\n"
]
},
{
Expand Down
Loading

0 comments on commit ae3b113

Please sign in to comment.