Feat/remove high res (#54)
* Add concurrency to embedding generation

* Fix formatting

* Fix issue with summarizing documents
homanp authored Feb 22, 2024
1 parent fca03bb commit 4e43556
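
The concurrency changes in this commit follow one standard asyncio pattern: each unit of work is wrapped in a coroutine that acquires a shared asyncio.Semaphore before running, and all tasks are dispatched together with asyncio.gather, so at most N run at once. A minimal standalone sketch of the pattern, with a hypothetical embed_one coroutine standing in for the real embedding call:

import asyncio

async def embed_one(item: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for a real embedding request
    return item.upper()

async def embed_all(items: list[str], limit: int = 10) -> list[str]:
    sem = asyncio.Semaphore(limit)  # at most `limit` bodies run concurrently

    async def bounded(item: str) -> str:
        async with sem:  # waits here once `limit` tasks hold the semaphore
            return await embed_one(item)

    # gather returns results in input order
    return await asyncio.gather(*(bounded(i) for i in items))

print(asyncio.run(embed_all([f"chunk-{i}" for i in range(25)])))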
Showing 6 changed files with 161 additions and 30 deletions.
131 changes: 120 additions & 11 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -19,7 +19,7 @@ black = "^23.12.1"
 flake8 = "^7.0.0"
 vulture = "^2.11"
 python-decouple = "^3.8"
-semantic-router = "^0.0.20"
+semantic-router = {extras = ["fastembed"], version = "^0.0.22"}
 astrapy = "^0.7.4"
 openai = "^1.12.0"
 tqdm = "^4.66.2"
50 changes: 35 additions & 15 deletions service/embedding.py
@@ -9,6 +9,7 @@
 from semantic_router.encoders import (
     BaseEncoder,
     CohereEncoder,
+    FastEmbedEncoder,
     HuggingFaceEncoder,
     OpenAIEncoder,
 )
@@ -69,7 +70,10 @@ async def _download_and_extract_elements(
             temp_file.write(response.content)
             temp_file.flush()
             elements = partition(
-                file=temp_file, include_page_breaks=True, strategy=strategy
+                file=temp_file,
+                include_page_breaks=True,
+                strategy=strategy,
+                skip_infer_table_types=["pdf"],
             )
         return elements
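
Note: the new skip_infer_table_types=["pdf"] argument tells unstructured's partition() not to run table-structure inference on PDFs. Since table inference is typically what pushes the "auto" strategy onto the slow hi_res partitioning path, this is presumably what lets the commit drop high-res processing, per its title. A hedged call sketch, with a hypothetical local file standing in for the downloaded document:

from unstructured.partition.auto import partition

# "example.pdf" is a placeholder; the service streams the file from a URL instead.
with open("example.pdf", "rb") as f:
    elements = partition(
        file=f,
        include_page_breaks=True,
        strategy="auto",  # assumption: callers pass a strategy string like this
        skip_infer_table_types=["pdf"],
    )
print(len(elements))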

@@ -107,7 +111,7 @@ async def generate_chunks(
             if not document:
                 continue
             chunks = chunk_by_title(
-                elements, max_characters=500, combine_text_under_n_chars=0
+                elements, max_characters=1500, new_after_n_chars=1000
             )
             for chunk in chunks:
                 # Ensure all metadata values are of a type acceptable
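
The new chunking parameters give chunk_by_title a 1000-character soft cap (new_after_n_chars: start a new chunk once the current one reaches this) and a 1500-character hard cap (max_characters: never exceed this), replacing the old 500-character hard cap. A small hedged sketch using unstructured directly on synthetic text:

from unstructured.chunking.title import chunk_by_title
from unstructured.partition.text import partition_text

# Synthetic input; the service chunks elements partitioned from real files.
elements = partition_text(text="Intro\n\n" + "word " * 600)
chunks = chunk_by_title(elements, max_characters=1500, new_after_n_chars=1000)
for chunk in chunks:
    assert len(chunk.text) <= 1500  # the hard cap always holds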
@@ -150,15 +154,17 @@ async def generate_and_upsert_embeddings(
         index_name: Optional[str] = None,
     ) -> List[BaseDocumentChunk]:
         pbar = tqdm(total=len(documents), desc="Generating embeddings")
+        sem = asyncio.Semaphore(10)  # Limit to 10 concurrent tasks
 
         async def safe_generate_embedding(
             chunk: BaseDocumentChunk,
         ) -> BaseDocumentChunk | None:
-            try:
-                return await generate_embedding(chunk)
-            except Exception as e:
-                logger.error(f"Error embedding document {chunk.id}: {e}")
-                return None
+            async with sem:
+                try:
+                    return await generate_embedding(chunk)
+                except Exception as e:
+                    logger.error(f"Error embedding document {chunk.id}: {e}")
+                    return None
 
         async def generate_embedding(
             chunk: BaseDocumentChunk,
@@ -167,8 +173,6 @@ async def generate_embedding(
             embeddings: List[np.ndarray] = [
                 np.array(e) for e in encoder([chunk.content])
             ]
-
-            logger.info(f"Embedding: {chunk.id}, metadata: {chunk.metadata}")
             chunk.dense_embedding = embeddings[0].tolist()
             pbar.update()
             return chunk
@@ -191,23 +195,38 @@ async def generate_embedding(
 
         return chunks_with_embeddings
 
     # TODO: Do we summarize the documents or chunks here?
     async def generate_summary_documents(
         self, documents: List[BaseDocumentChunk]
     ) -> List[BaseDocumentChunk]:
-        pbar = tqdm(total=len(documents), desc="Summarizing documents")
+        pbar = tqdm(total=len(documents), desc="Grouping chunks")
         pages = {}
         for document in documents:
             page_number = document.metadata.get("page_number", None)
             if page_number not in pages:
-                doc = copy.deepcopy(document)
-                doc.content = await completion(document=doc)
-                pages[page_number] = doc
+                pages[page_number] = copy.deepcopy(document)
             else:
                 pages[page_number].content += document.content
             pbar.update()
         pbar.close()
-        summary_documents = list(pages.values())
+
+        # Limit to 10 concurrent jobs
+        sem = asyncio.Semaphore(10)
+
+        async def safe_completion(document: BaseDocumentChunk) -> BaseDocumentChunk:
+            async with sem:
+                try:
+                    document.content = await completion(document=document)
+                    pbar.update()
+                    return document
+                except Exception as e:
+                    logger.error(f"Error summarizing document {document.id}: {e}")
+                    return None
+
+        pbar = tqdm(total=len(pages), desc="Summarizing documents")
+        tasks = [safe_completion(document) for document in pages.values()]
+        summary_documents = await asyncio.gather(*tasks, return_exceptions=False)
+        pbar.close()
 
         return summary_documents
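
One caveat in the rewritten summarizer: safe_completion returns None when a summary fails (despite its BaseDocumentChunk return annotation), and asyncio.gather passes those None values straight through into summary_documents. Callers presumably need to filter them out, e.g. (a hypothetical follow-up, not part of this commit):

summary_documents = [doc for doc in summary_documents if doc is not None]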


@@ -216,6 +235,7 @@ def get_encoder(*, encoder_config: Encoder) -> BaseEncoder:
         EncoderEnum.cohere: CohereEncoder,
         EncoderEnum.openai: OpenAIEncoder,
         EncoderEnum.huggingface: HuggingFaceEncoder,
+        EncoderEnum.fastembed: FastEmbedEncoder,
     }
     encoder_provider = encoder_config.type
     encoder = encoder_config.name
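With the new mapping entry, an EncoderEnum.fastembed config resolves to semantic-router's FastEmbedEncoder. All of these encoders share the callable interface that generate_embedding relies on above; a hedged usage sketch, where the model name is assumed to be the library's default:

from semantic_router.encoders import FastEmbedEncoder

# Assumption: "BAAI/bge-small-en-v1.5" is the default FastEmbed model here.
encoder = FastEmbedEncoder(name="BAAI/bge-small-en-v1.5")
embeddings = encoder(["some chunk text"])  # encoders take a list of strings
print(len(embeddings), len(embeddings[0]))
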
2 changes: 1 addition & 1 deletion service/router.py
@@ -34,7 +34,7 @@ def create_route_layer() -> RouteLayer:
 async def get_documents(
     *, vector_service: BaseVectorDatabase, payload: RequestPayload
 ) -> list[BaseDocumentChunk]:
-    chunks = await vector_service.query(input=payload.input, top_k=25)
+    chunks = await vector_service.query(input=payload.input, top_k=5)
 
     if not len(chunks):
         logger.error(f"No documents found for query: {payload.input}")
2 changes: 1 addition & 1 deletion vectordbs/__init__.py
@@ -19,7 +19,7 @@ def get_vector_service(
     index_name: str,
     credentials: VectorDatabase,
     encoder: BaseEncoder = OpenAIEncoder(),
-    dimensions: Optional[int] = None,
+    dimensions: Optional[int] = 384,
 ) -> BaseVectorDatabase:
     services = {
         "pinecone": PineconeService,
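The dimensions default moving from None to 384 presumably matches the embedding size of the FastEmbed default model (bge-small emits 384-dimensional vectors), so indexes created without an explicit dimension agree with the new encoder; any other encoder still needs dimensions set by the caller. A quick hedged sanity check, under the same default-model assumption:

from semantic_router.encoders import FastEmbedEncoder

vector = FastEmbedEncoder()(["dimension check"])[0]
assert len(vector) == 384  # assumption: the default model is 384-dimensional
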
4 changes: 3 additions & 1 deletion vectordbs/qdrant.py
@@ -8,6 +8,8 @@
 from models.document import BaseDocumentChunk
 from vectordbs.base import BaseVectorDatabase
 
+MAX_QUERY_TOP_K = 5
+
 
 class QdrantService(BaseVectorDatabase):
     def __init__(
@@ -66,7 +68,7 @@ async def upsert(self, chunks: List[BaseDocumentChunk]) -> None:
 
         self.client.upsert(collection_name=self.index_name, wait=True, points=points)
 
-    async def query(self, input: str, top_k: int) -> List:
+    async def query(self, input: str, top_k: int = MAX_QUERY_TOP_K) -> List:
         vectors = await self._generate_vectors(input=input)
         search_result = self.client.search(
             collection_name=self.index_name,
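With the new default, callers can omit top_k and get the capped value of 5, which also lines up with the top_k=5 now passed explicitly in service/router.py. A hedged usage sketch (constructing the service and its credentials is elided here):

async def fetch(service):  # `service` is an initialized QdrantService
    # top_k falls back to MAX_QUERY_TOP_K (5) when omitted
    return await service.query(input="what is semantic routing?")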
