feat: Semantic Splitter + minor improvements
simjak committed Feb 25, 2024
1 parent 8264094 commit 542d8db
Showing 17 changed files with 832 additions and 427 deletions.
6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
  "cSpell.words": [
    "tiktoken",
    "Upserted"
  ]
}
20 changes: 13 additions & 7 deletions api/ingest.py
@@ -6,7 +6,7 @@

from models.ingest import RequestPayload
from service.embedding import EmbeddingService, get_encoder
from service.ingest import handle_urls, handle_google_drive
from service.ingest import handle_google_drive, handle_urls
from utils.summarise import SUMMARY_SUFFIX

router = APIRouter()
@@ -16,23 +16,29 @@
async def ingest(payload: RequestPayload) -> Dict:
    encoder = get_encoder(encoder_config=payload.encoder)
    embedding_service = EmbeddingService(
        encoder=encoder,
        index_name=payload.index_name,
        vector_credentials=payload.vector_database,
        dimensions=payload.encoder.dimensions,
    )
    chunks = []
    summary_documents = []
    if payload.files:
        chunks, summary_documents = await handle_urls(embedding_service, payload.files)
        chunks, summary_documents = await handle_urls(
            embedding_service, payload.files, payload.chunk_config
        )

    elif payload.google_drive:
        chunks, summary_documents = await handle_google_drive(
            embedding_service, payload.google_drive
        )
        ) # type: ignore TODO: Fix typing

    await asyncio.gather(
        embedding_service.generate_and_upsert_embeddings(
            documents=chunks, encoder=encoder, index_name=payload.index_name
        embedding_service.embed_and_upsert(
            chunks=chunks, encoder=encoder, index_name=payload.index_name
        ),
        embedding_service.generate_and_upsert_embeddings(
            documents=summary_documents,
        embedding_service.embed_and_upsert(
            chunks=summary_documents,
            encoder=encoder,
            index_name=f"{payload.index_name}{SUMMARY_SUFFIX}",
        ),
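Putting this together with the RequestPayload and ChunkConfig models further down, a request to the updated /ingest endpoint would look roughly like the sketch below. The endpoint path comes from this router; the base URL, the field values, and the exact vector_database shape are placeholders rather than anything defined in this commit.

import requests  # any HTTP client works; this is only an illustration

payload = {
    "index_name": "my-docs",
    "encoder": {"name": "text-embedding-3-small", "provider": "openai", "dimensions": 1536},
    "vector_database": {"type": "pinecone", "config": {"api_key": "...", "host": "..."}},  # assumed shape
    "chunk_config": {
        "partition_strategy": "auto",
        "split_method": "semantic",
        "min_chunk_tokens": 50,
        "max_token_size": 300,
        "rolling_window_size": 1,
    },
    "files": [{"url": "https://example.com/report.pdf", "name": "report.pdf"}],
}

response = requests.post("http://localhost:8000/ingest", json=payload)
print(response.json())

Note that the handler upserts the chunks into index_name and the summary documents into a second index suffixed with SUMMARY_SUFFIX, with both writes running concurrently via asyncio.gather.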
11 changes: 3 additions & 8 deletions api/query.py
@@ -1,6 +1,6 @@
from fastapi import APIRouter

from models.query import RequestPayload, ResponseData, ResponsePayload
from models.query import RequestPayload, ResponsePayload
from service.router import query as _query

router = APIRouter()
@@ -9,10 +9,5 @@
@router.post("/query", response_model=ResponsePayload)
async def query(payload: RequestPayload):
    chunks = await _query(payload=payload)
    response_data = [
        ResponseData(
            content=chunk.content, doc_url=chunk.doc_url, page_number=chunk.page_number
        )
        for chunk in chunks
    ]
    return {"success": True, "data": response_data}
    # NOTE: Filter out fields before passing them to the LLM
    return ResponsePayload(success=True, data=chunks)
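Since data is now typed as List[BaseDocumentChunk] (see models/query.py below), the /query response carries full chunk objects instead of the trimmed ResponseData projection. A rough sketch of the new response shape, with placeholder values:

# Illustrative /query response body (placeholder values):
{
    "success": True,
    "data": [
        {
            "id": "7b0c9f4e-0d8e-4a57-9c1a-1f2d3e4a5b6c",
            "document_id": "doc_1",
            "content": "Semantic splitting keeps related sentences in the same chunk.",
            "doc_url": "https://example.com/report.pdf",
            "source": "https://example.com/report.pdf",
            "source_type": "pdf",
            "chunk_index": 3,
            "title": "Report",
            "token_count": 212,
            "page_number": 7,
            "metadata": {},
            "dense_embedding": None,
        }
    ],
}

The NOTE in the handler flags that some of these fields (dense_embedding in particular) should eventually be filtered out before the payload is handed to an LLM.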
2 changes: 1 addition & 1 deletion dev/embedding.ipynb
@@ -40,7 +40,7 @@
"metadata": {},
"outputs": [],
"source": [
"elements = await embedding_service._download_and_extract_elements(file, strategy=\"auto\")\n"
"elements = await embedding_service._partition_file(file, strategy=\"auto\")\n"
]
},
{
190 changes: 156 additions & 34 deletions dev/walkthrough.ipynb

Large diffs are not rendered by default.

Empty file added models/__init__.py
Empty file.
57 changes: 52 additions & 5 deletions models/document.py
@@ -13,25 +13,72 @@ class BaseDocument(BaseModel):

class BaseDocumentChunk(BaseModel):
    id: str
    doc_url: str
    document_id: str
    content: str
    doc_url: str
    source: str
    source_type: str
    chunk_index: int | None = None
    title: str | None = None
    token_count: int | None = None
    page_number: int | None = None
    metadata: dict | None = None
    dense_embedding: Optional[List[float]] = None

    @classmethod
    def from_metadata(cls, metadata: dict):
        exclude_keys = {
            "chunk_id",
            "chunk_index",
            "document_id",
            "doc_url",
            "content",
            "source",
            "source_type",
            "title",
            "token_count",
            "page_number",
        }
        filtered_metadata = {k: v for k, v in metadata.items() if k not in exclude_keys}
        return cls(
            id=metadata.get("chunk_id", ""),
            **metadata,
            metadata=filtered_metadata,
            dense_embedding=metadata.get("values"),
        )

    @validator("id")
    def id_must_be_valid_uuid(cls, v): # noqa: F841
    def id_must_be_valid_uuid(cls, v):
        try:
            uuid_obj = uuid.UUID(v, version=4)
            return str(uuid_obj)
        except ValueError:
            raise ValueError("id must be a valid UUID")
            raise ValueError(f"id must be a valid UUID, got {v}")

    @validator("dense_embedding")
    def embeddings_must_be_list_of_floats(cls, v): # noqa: F841
    def embeddings_must_be_list_of_floats(cls, v):
        if v is None:
            return v # Allow None to pass through
        if not all(isinstance(item, float) for item in v):
            raise ValueError("embeddings must be a list of floats")
            raise ValueError(f"embeddings must be a list of floats, got {v}")
        return v

    def to_vector_db(self):
        metadata = {
            "chunk_id": self.id,
            "chunk_index": self.chunk_index or "",
            "document_id": self.document_id,
            "doc_url": self.doc_url,
            "content": self.content,
            "source": self.source,
            "source_type": self.source_type,
            "title": self.title or "",
            "token_count": self.token_count,
            **(self.metadata or {}),
        }
        result = {
            "id": self.id,
            "values": self.dense_embedding,
            "metadata": metadata,
        }
        return result
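As a usage sketch: to_vector_db() flattens a chunk into the record format written to the vector store, and from_metadata() rebuilds a chunk from such a record. The example below is illustrative and relies on pydantic's default behaviour of ignoring unknown keyword arguments, since from_metadata passes the whole metadata dict through **metadata.

import uuid

from models.document import BaseDocumentChunk

chunk = BaseDocumentChunk(
    id=str(uuid.uuid4()),
    document_id="doc_1",
    content="Semantic splitting keeps related sentences in the same chunk.",
    doc_url="https://example.com/report.pdf",
    source="https://example.com/report.pdf",
    source_type="pdf",
    chunk_index=1,
    title="Report",
    token_count=12,
    metadata={"language": "en"},
    dense_embedding=[0.1, 0.2, 0.3],
)

record = chunk.to_vector_db()
# record -> {"id": "...", "values": [0.1, 0.2, 0.3], "metadata": {"chunk_id": "...",
#            "content": "...", "language": "en", ...}}; note page_number is not copied in.

# Rebuild a chunk from a record read back out of the vector store; "values" repopulates
# dense_embedding and keys outside exclude_keys land back in .metadata.
restored = BaseDocumentChunk.from_metadata({**record["metadata"], "values": record["values"]})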
22 changes: 14 additions & 8 deletions models/file.py
@@ -1,7 +1,7 @@
from enum import Enum
from urllib.parse import unquote, urlparse

from pydantic import BaseModel, validator
from pydantic import BaseModel


class FileType(Enum):
@@ -32,13 +32,11 @@ def suffix(self) -> str:

class File(BaseModel):
    url: str
    type: FileType | None = None
    name: str | None = None

    @validator("type", pre=True, always=True)
    def set_type_from_url(cls, v, values): # noqa: F841
        if v is not None:
            return v
        url = values.get("url")
    @property
    def type(self) -> FileType | None:
        url = self.url
        if url:
            parsed_url = urlparse(url)
            path = unquote(parsed_url.path)
@@ -47,4 +45,12 @@ def set_type_from_url(cls, v, values): # noqa: F841
                return FileType[extension]
            except KeyError:
                raise ValueError(f"Unsupported file type for URL: {url}")
        return v
        return None

    @property
    def suffix(self) -> str:
        file_type = self.type
        if file_type is not None:
            return file_type.suffix()
        else:
            raise ValueError("File type is undefined, cannot determine suffix.")
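The validator that used to populate type at construction time is replaced by lazy properties, so the file type is now derived from the URL each time it is accessed. A small illustration, assuming FileType has members such as pdf and that FileType.suffix() returns the dotted extension (neither is shown in this diff):

from models.file import File

f = File(url="https://example.com/docs/Annual%20Report.pdf")
f.type    # FileType.pdf -- parsed lazily from the decoded URL path
f.suffix  # e.g. ".pdf", whatever FileType.suffix() returns for that member

File(url="https://example.com/data.unknown").type
# raises ValueError: Unsupported file type for URL: https://example.com/data.unknown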
28 changes: 21 additions & 7 deletions models/ingest.py
@@ -1,11 +1,24 @@
from enum import Enum
from typing import List, Optional
from typing import List, Literal, Optional

from pydantic import BaseModel
from pydantic import BaseModel, Field

from models.file import File
from models.vector_database import VectorDatabase
from models.google_drive import GoogleDrive
from models.vector_database import VectorDatabase


class ChunkConfig(BaseModel):
    partition_strategy: Literal["auto", "hi_res"] = "auto"
    split_method: Literal["by_title", "semantic"] = "by_title"
    min_chunk_tokens: int = Field(50, description="Only for `semantic` method")
    max_token_size: int = Field(300, description="Only for `semantic` method")
    rolling_window_size: int = Field(
        1,
        description=(
            "Only for `semantic` method. Compares each element with the previous one"
        ),
    )


class EncoderEnum(str, Enum):
@@ -15,14 +28,15 @@ class EncoderEnum(str, Enum):

class Encoder(BaseModel):
    name: str
    type: str
    provider: str
    dimensions: Optional[int] = None


class RequestPayload(BaseModel):
    files: Optional[List[File]] = None
    google_drive: Optional[GoogleDrive] = None
    index_name: str
    encoder: Encoder
    vector_database: VectorDatabase
    index_name: str
    chunk_config: ChunkConfig
    files: Optional[List[File]] = None
    google_drive: Optional[GoogleDrive] = None
    webhook_url: Optional[str] = None
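The new ChunkConfig travels inside RequestPayload and controls both the partition strategy (the "auto"/"hi_res" values appear to mirror unstructured's partition strategies) and the splitting method. A small construction sketch with illustrative values:

from models.ingest import ChunkConfig

# Defaults: partition_strategy="auto", split_method="by_title".
default_config = ChunkConfig()

# Semantic splitting; the token bounds and rolling window only apply to this method.
semantic_config = ChunkConfig(
    split_method="semantic",
    min_chunk_tokens=64,
    max_token_size=256,
    rolling_window_size=2,
)

# The Literal[...] annotations mean unsupported values are rejected by pydantic.
ChunkConfig(split_method="recursive")  # raises a ValidationError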
11 changes: 6 additions & 5 deletions models/query.py
@@ -1,6 +1,7 @@
from typing import List, Optional

from pydantic import BaseModel
from models.document import BaseDocumentChunk

from models.ingest import Encoder
from models.vector_database import VectorDatabase
@@ -14,12 +15,12 @@ class RequestPayload(BaseModel):
    session_id: Optional[str] = None


class ResponseData(BaseModel):
    content: str
    doc_url: str
    page_number: Optional[int]
# class ResponseData(BaseModel):
# content: str
# doc_url: str
# page_number: Optional[int]


class ResponsePayload(BaseModel):
    success: bool
    data: List[ResponseData]
    data: List[BaseDocumentChunk]