Skip to content
This repository was archived by the owner on Feb 15, 2025. It is now read-only.

Commit 1c0909c

Browse files
authored
Merge branch 'main' into justin-the-law
2 parents d171516 + e679ad2 commit 1c0909c

File tree

2 files changed, with 38 additions and 38 deletions.

2 files changed

+38
-38
lines changed

src/leapfrogai_api/backend/rag/index.py

Lines changed: 37 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -319,35 +319,40 @@ async def aadd_documents(
319319
documents: list[Document],
320320
vector_store_id: str,
321321
file_id: str,
322+
batch_size: int = 100,
322323
) -> list[str]:
323-
"""Adds documents to the vector store.
324-
324+
"""Adds documents to the vector store in batches.
325325
Args:
326326
documents (list[Document]): A list of Langchain Document objects to be added.
327327
vector_store_id (str): The ID of the vector store where the documents will be added.
328328
file_id (str): The ID of the file associated with the documents.
329-
329+
batch_size (int): The size of the batches that will be pushed to the db. This value defaults to 100
330+
as a balance between the memory impact of large files and performance improvements from batching.
330331
Returns:
331332
List[str]: A list of IDs assigned to the added documents.
332-
333333
Raises:
334334
Any exceptions that may occur during the execution of the method.
335-
336335
"""
337-
ids = [] # Initialize the ids list
336+
ids = []
338337
embeddings = await self.embeddings.aembed_documents(
339338
texts=[document.page_content for document in documents]
340339
)
341340

341+
vectors = []
342342
for document, embedding in zip(documents, embeddings):
343-
response = await self._aadd_vector(
344-
vector_store_id=vector_store_id,
345-
file_id=file_id,
346-
content=document.page_content,
347-
metadata=document.metadata,
348-
embedding=embedding,
343+
vector = {
344+
"content": document.page_content,
345+
"metadata": document.metadata,
346+
"embedding": embedding,
347+
}
348+
vectors.append(vector)
349+
350+
for i in range(0, len(vectors), batch_size):
351+
batch = vectors[i : i + batch_size]
352+
response = await self._aadd_vectors(
353+
vector_store_id=vector_store_id, file_id=file_id, vectors=batch
349354
)
350-
ids.append(response[0]["id"])
355+
ids.extend([item["id"] for item in response])
351356

352357
return ids
353358

@@ -418,39 +423,34 @@ async def _adelete_vector(
418423
)
419424
return response
420425

421-
async def _aadd_vector(
422-
self,
423-
vector_store_id: str,
424-
file_id: str,
425-
content: str,
426-
metadata: str,
427-
embedding: list[float],
426+
async def _aadd_vectors(
427+
self, vector_store_id: str, file_id: str, vectors: list[dict[str, any]]
428428
) -> dict:
429-
"""Add a vector to the vector store.
429+
"""Add multiple vectors to the vector store in a batch.
430430
431431
Args:
432432
vector_store_id (str): The ID of the vector store.
433-
file_id (str): The ID of the file associated with the vector.
434-
content (str): The content of the vector.
435-
metadata (str): The metadata associated with the vector.
436-
embedding (list[float]): The embedding of the vector.
433+
file_id (str): The ID of the file associated with the vectors.
434+
vectors (list[dict]): A list of dictionaries containing vector data.
437435
438436
Returns:
439-
dict: The response from the database after inserting the vector.
440-
437+
dict: The response from the database after inserting the vectors.
441438
"""
442-
443439
user_id: str = (await self.db.auth.get_user()).user.id
444440

445-
row: dict[str, any] = {
446-
"user_id": user_id,
447-
"vector_store_id": vector_store_id,
448-
"file_id": file_id,
449-
"content": content,
450-
"metadata": metadata,
451-
"embedding": embedding,
452-
}
453-
data, _count = await self.db.from_(self.table_name).insert(row).execute()
441+
rows = []
442+
for vector in vectors:
443+
row = {
444+
"user_id": user_id,
445+
"vector_store_id": vector_store_id,
446+
"file_id": file_id,
447+
"content": vector["content"],
448+
"metadata": vector["metadata"],
449+
"embedding": vector["embedding"],
450+
}
451+
rows.append(row)
452+
453+
data, _count = await self.db.from_(self.table_name).insert(rows).execute()
454454

455455
_, response = data
456456

src/leapfrogai_api/data/crud_file_bucket.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ async def upload(self, file: UploadFile, id_: str):
1515
"""Upload a file to the file bucket."""
1616

1717
return await self.client.storage.from_("file_bucket").upload(
18-
file=file.file.read(), path=f"{id_}"
18+
file=await file.read(), path=f"{id_}"
1919
)
2020

2121
async def download(self, id_: str):

0 commit comments

Comments
 (0)