Feat/remove high res (#54)
* Add concurrency to embedding generation

* Fix formatting

* Fix issue with summarizing documents
homanp authored Feb 22, 2024
1 parent fca03bb commit 4e43556
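
The concurrency changes in this commit follow one standard asyncio pattern: each unit of work is wrapped in a coroutine that acquires a shared asyncio.Semaphore before running, and all tasks are dispatched together with asyncio.gather, so at most N run at once. A minimal standalone sketch of the pattern, with a hypothetical embed_one coroutine standing in for the real embedding call:

import asyncio

async def embed_one(item: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for a real embedding request
    return item.upper()

async def embed_all(items: list[str], limit: int = 10) -> list[str]:
    sem = asyncio.Semaphore(limit)  # at most `limit` bodies run concurrently

    async def bounded(item: str) -> str:
        async with sem:  # waits here once `limit` tasks hold the semaphore
            return await embed_one(item)

    # gather returns results in input order
    return await asyncio.gather(*(bounded(i) for i in items))

print(asyncio.run(embed_all([f"chunk-{i}" for i in range(25)])))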
Showing 6 changed files with 161 additions and 30 deletions.
131 changes: 120 additions & 11 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -19,7 +19,7 @@ black = "^23.12.1"
 flake8 = "^7.0.0"
 vulture = "^2.11"
 python-decouple = "^3.8"
-semantic-router = "^0.0.20"
+semantic-router = {extras = ["fastembed"], version = "^0.0.22"}
 astrapy = "^0.7.4"
 openai = "^1.12.0"
 tqdm = "^4.66.2"
50 changes: 35 additions & 15 deletions service/embedding.py
@@ -9,6 +9,7 @@
 from semantic_router.encoders import (
     BaseEncoder,
     CohereEncoder,
+    FastEmbedEncoder,
     HuggingFaceEncoder,
     OpenAIEncoder,
 )
@@ -69,7 +70,10 @@ async def _download_and_extract_elements(
             temp_file.write(response.content)
             temp_file.flush()
             elements = partition(
-                file=temp_file, include_page_breaks=True, strategy=strategy
+                file=temp_file,
+                include_page_breaks=True,
+                strategy=strategy,
+                skip_infer_table_types=["pdf"],
             )
         return elements
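
Note: the new skip_infer_table_types=["pdf"] argument tells unstructured's partition() not to run table-structure inference on PDFs. Since table inference is typically what pushes the "auto" strategy onto the slow hi_res partitioning path, this is presumably what lets the commit drop high-res processing, per its title. A hedged call sketch, with a hypothetical local file standing in for the downloaded document:

from unstructured.partition.auto import partition

# "example.pdf" is a placeholder; the service streams the file from a URL instead.
with open("example.pdf", "rb") as f:
    elements = partition(
        file=f,
        include_page_breaks=True,
        strategy="auto",  # assumption: callers pass a strategy string like this
        skip_infer_table_types=["pdf"],
    )
print(len(elements))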

@@ -107,7 +111,7 @@ async def generate_chunks(
             if not document:
                 continue
             chunks = chunk_by_title(
-                elements, max_characters=500, combine_text_under_n_chars=0
+                elements, max_characters=1500, new_after_n_chars=1000
             )
             for chunk in chunks:
                 # Ensure all metadata values are of a type acceptable
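
The new chunking parameters give chunk_by_title a 1000-character soft cap (new_after_n_chars: start a new chunk once the current one reaches this) and a 1500-character hard cap (max_characters: never exceed this), replacing the old 500-character hard cap. A small hedged sketch using unstructured directly on synthetic text:

from unstructured.chunking.title import chunk_by_title
from unstructured.partition.text import partition_text

# Synthetic input; the service chunks elements partitioned from real files.
elements = partition_text(text="Intro\n\n" + "word " * 600)
chunks = chunk_by_title(elements, max_characters=1500, new_after_n_chars=1000)
for chunk in chunks:
    assert len(chunk.text) <= 1500  # the hard cap always holds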
@@ -150,15 +154,17 @@ async def generate_and_upsert_embeddings(
         index_name: Optional[str] = None,
     ) -> List[BaseDocumentChunk]:
         pbar = tqdm(total=len(documents), desc="Generating embeddings")
+        sem = asyncio.Semaphore(10)  # Limit to 10 concurrent tasks
 
         async def safe_generate_embedding(
             chunk: BaseDocumentChunk,
         ) -> BaseDocumentChunk | None:
-            try:
-                return await generate_embedding(chunk)
-            except Exception as e:
-                logger.error(f"Error embedding document {chunk.id}: {e}")
-                return None
+            async with sem:
+                try:
+                    return await generate_embedding(chunk)
+                except Exception as e:
+                    logger.error(f"Error embedding document {chunk.id}: {e}")
+                    return None
 
         async def generate_embedding(
             chunk: BaseDocumentChunk,
@@ -167,8 +173,6 @@ async def generate_embedding(
             embeddings: List[np.ndarray] = [
                 np.array(e) for e in encoder([chunk.content])
             ]
-
-            logger.info(f"Embedding: {chunk.id}, metadata: {chunk.metadata}")
             chunk.dense_embedding = embeddings[0].tolist()
             pbar.update()
             return chunk
@@ -191,23 +195,38 @@ async def generate_embedding(
 
         return chunks_with_embeddings
 
     # TODO: Do we summarize the documents or chunks here?
     async def generate_summary_documents(
         self, documents: List[BaseDocumentChunk]
     ) -> List[BaseDocumentChunk]:
-        pbar = tqdm(total=len(documents), desc="Summarizing documents")
+        pbar = tqdm(total=len(documents), desc="Grouping chunks")
         pages = {}
         for document in documents:
             page_number = document.metadata.get("page_number", None)
             if page_number not in pages:
-                doc = copy.deepcopy(document)
-                doc.content = await completion(document=doc)
-                pages[page_number] = doc
+                pages[page_number] = copy.deepcopy(document)
             else:
                 pages[page_number].content += document.content
             pbar.update()
         pbar.close()
-        summary_documents = list(pages.values())
+
+        # Limit to 10 concurrent jobs
+        sem = asyncio.Semaphore(10)
+
+        async def safe_completion(document: BaseDocumentChunk) -> BaseDocumentChunk:
+            async with sem:
+                try:
+                    document.content = await completion(document=document)
+                    pbar.update()
+                    return document
+                except Exception as e:
+                    logger.error(f"Error summarizing document {document.id}: {e}")
+                    return None
+
+        pbar = tqdm(total=len(pages), desc="Summarizing documents")
+        tasks = [safe_completion(document) for document in pages.values()]
+        summary_documents = await asyncio.gather(*tasks, return_exceptions=False)
+        pbar.close()
 
         return summary_documents
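
One caveat in the rewritten summarizer: safe_completion returns None when a summary fails (despite its BaseDocumentChunk return annotation), and asyncio.gather passes those None values straight through into summary_documents. Callers presumably need to filter them out, e.g. (a hypothetical follow-up, not part of this commit):

summary_documents = [doc for doc in summary_documents if doc is not None]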


@@ -216,6 +235,7 @@ def get_encoder(*, encoder_config: Encoder) -> BaseEncoder:
         EncoderEnum.cohere: CohereEncoder,
         EncoderEnum.openai: OpenAIEncoder,
         EncoderEnum.huggingface: HuggingFaceEncoder,
+        EncoderEnum.fastembed: FastEmbedEncoder,
     }
     encoder_provider = encoder_config.type
     encoder = encoder_config.name
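With the new mapping entry, an EncoderEnum.fastembed config resolves to semantic-router's FastEmbedEncoder. All of these encoders share the callable interface that generate_embedding relies on above; a hedged usage sketch, where the model name is assumed to be the library's default:

from semantic_router.encoders import FastEmbedEncoder

# Assumption: "BAAI/bge-small-en-v1.5" is the default FastEmbed model here.
encoder = FastEmbedEncoder(name="BAAI/bge-small-en-v1.5")
embeddings = encoder(["some chunk text"])  # encoders take a list of strings
print(len(embeddings), len(embeddings[0]))
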
2 changes: 1 addition & 1 deletion service/router.py
@@ -34,7 +34,7 @@ def create_route_layer() -> RouteLayer:
 async def get_documents(
     *, vector_service: BaseVectorDatabase, payload: RequestPayload
 ) -> list[BaseDocumentChunk]:
-    chunks = await vector_service.query(input=payload.input, top_k=25)
+    chunks = await vector_service.query(input=payload.input, top_k=5)
 
     if not len(chunks):
         logger.error(f"No documents found for query: {payload.input}")
2 changes: 1 addition & 1 deletion vectordbs/__init__.py
@@ -19,7 +19,7 @@ def get_vector_service(
     index_name: str,
     credentials: VectorDatabase,
     encoder: BaseEncoder = OpenAIEncoder(),
-    dimensions: Optional[int] = None,
+    dimensions: Optional[int] = 384,
 ) -> BaseVectorDatabase:
     services = {
         "pinecone": PineconeService,
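The dimensions default moving from None to 384 presumably matches the embedding size of the FastEmbed default model (bge-small emits 384-dimensional vectors), so indexes created without an explicit dimension agree with the new encoder; any other encoder still needs dimensions set by the caller. A quick hedged sanity check, under the same default-model assumption:

from semantic_router.encoders import FastEmbedEncoder

vector = FastEmbedEncoder()(["dimension check"])[0]
assert len(vector) == 384  # assumption: the default model is 384-dimensional
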
4 changes: 3 additions & 1 deletion vectordbs/qdrant.py
@@ -8,6 +8,8 @@
 from models.document import BaseDocumentChunk
 from vectordbs.base import BaseVectorDatabase
 
+MAX_QUERY_TOP_K = 5
+
 
 class QdrantService(BaseVectorDatabase):
     def __init__(
@@ -66,7 +68,7 @@ async def upsert(self, chunks: List[BaseDocumentChunk]) -> None:
 
         self.client.upsert(collection_name=self.index_name, wait=True, points=points)
 
-    async def query(self, input: str, top_k: int) -> List:
+    async def query(self, input: str, top_k: int = MAX_QUERY_TOP_K) -> List:
         vectors = await self._generate_vectors(input=input)
         search_result = self.client.search(
             collection_name=self.index_name,
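With the new default, callers can omit top_k and get the capped value of 5, which also lines up with the top_k=5 now passed explicitly in service/router.py. A hedged usage sketch (constructing the service and its credentials is elided here):

async def fetch(service):  # `service` is an initialized QdrantService
    # top_k falls back to MAX_QUERY_TOP_K (5) when omitted
    return await service.query(input="what is semantic routing?")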
