@@ -319,35 +319,40 @@ async def aadd_documents(
319
319
documents : list [Document ],
320
320
vector_store_id : str ,
321
321
file_id : str ,
322
+ batch_size : int = 100 ,
322
323
) -> list [str ]:
323
- """Adds documents to the vector store.
324
-
324
+ """Adds documents to the vector store in batches.
325
325
Args:
326
326
documents (list[Document]): A list of Langchain Document objects to be added.
327
327
vector_store_id (str): The ID of the vector store where the documents will be added.
328
328
file_id (str): The ID of the file associated with the documents.
329
-
329
+ batch_size (int): The size of the batches that will be pushed to the db. This value defaults to 100
330
+ as a balance between the memory impact of large files and performance improvements from batching.
330
331
Returns:
331
332
List[str]: A list of IDs assigned to the added documents.
332
-
333
333
Raises:
334
334
Any exceptions that may occur during the execution of the method.
335
-
336
335
"""
337
- ids = [] # Initialize the ids list
336
+ ids = []
338
337
embeddings = await self .embeddings .aembed_documents (
339
338
texts = [document .page_content for document in documents ]
340
339
)
341
340
341
+ vectors = []
342
342
for document , embedding in zip (documents , embeddings ):
343
- response = await self ._aadd_vector (
344
- vector_store_id = vector_store_id ,
345
- file_id = file_id ,
346
- content = document .page_content ,
347
- metadata = document .metadata ,
348
- embedding = embedding ,
343
+ vector = {
344
+ "content" : document .page_content ,
345
+ "metadata" : document .metadata ,
346
+ "embedding" : embedding ,
347
+ }
348
+ vectors .append (vector )
349
+
350
+ for i in range (0 , len (vectors ), batch_size ):
351
+ batch = vectors [i : i + batch_size ]
352
+ response = await self ._aadd_vectors (
353
+ vector_store_id = vector_store_id , file_id = file_id , vectors = batch
349
354
)
350
- ids .append ( response [ 0 ] ["id" ])
355
+ ids .extend ([ item ["id" ] for item in response ])
351
356
352
357
return ids
353
358
@@ -418,39 +423,34 @@ async def _adelete_vector(
418
423
)
419
424
return response
420
425
421
- async def _aadd_vector (
422
- self ,
423
- vector_store_id : str ,
424
- file_id : str ,
425
- content : str ,
426
- metadata : str ,
427
- embedding : list [float ],
426
+ async def _aadd_vectors (
427
+ self , vector_store_id : str , file_id : str , vectors : list [dict [str , any ]]
428
428
) -> dict :
429
- """Add a vector to the vector store.
429
+ """Add multiple vectors to the vector store in a batch .
430
430
431
431
Args:
432
432
vector_store_id (str): The ID of the vector store.
433
- file_id (str): The ID of the file associated with the vector.
434
- content (str): The content of the vector.
435
- metadata (str): The metadata associated with the vector.
436
- embedding (list[float]): The embedding of the vector.
433
+ file_id (str): The ID of the file associated with the vectors.
434
+ vectors (list[dict]): A list of dictionaries containing vector data.
437
435
438
436
Returns:
439
- dict: The response from the database after inserting the vector.
440
-
437
+ dict: The response from the database after inserting the vectors.
441
438
"""
442
-
443
439
user_id : str = (await self .db .auth .get_user ()).user .id
444
440
445
- row : dict [str , any ] = {
446
- "user_id" : user_id ,
447
- "vector_store_id" : vector_store_id ,
448
- "file_id" : file_id ,
449
- "content" : content ,
450
- "metadata" : metadata ,
451
- "embedding" : embedding ,
452
- }
453
- data , _count = await self .db .from_ (self .table_name ).insert (row ).execute ()
441
+ rows = []
442
+ for vector in vectors :
443
+ row = {
444
+ "user_id" : user_id ,
445
+ "vector_store_id" : vector_store_id ,
446
+ "file_id" : file_id ,
447
+ "content" : vector ["content" ],
448
+ "metadata" : vector ["metadata" ],
449
+ "embedding" : vector ["embedding" ],
450
+ }
451
+ rows .append (row )
452
+
453
+ data , _count = await self .db .from_ (self .table_name ).insert (rows ).execute ()
454
454
455
455
_ , response = data
456
456
0 commit comments