From 4407f4f3050ae285ed6f8710962d440688fa69a9 Mon Sep 17 00:00:00 2001
From: Riley Viktoria Louvier
Date: Thu, 18 May 2023 15:18:38 -0600
Subject: [PATCH] Feat: Change ingest-data.ts to handle langchain limitations for larger upserts

---
Review notes (this section is ignored by `git am`):
 * ingestData: `index` was typed `string` and `embeddings` was typed `any[]`,
   but the caller passes `pinecone.Index(...)` and `new OpenAIEmbeddings()`.
   The parameter types are now derived from `PineconeStore.fromDocuments`.
 * ingestData: a `chunkSize` that is not a positive integer is rejected up
   front; zero or a negative value would otherwise loop forever.
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch, and the post-image blob id on the `index` line predates
   this revision. Run `git apply --check` before applying.

 scripts/ingest-data.ts | 54 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 10 deletions(-)

diff --git a/scripts/ingest-data.ts b/scripts/ingest-data.ts
index b729661..7413ef0 100644
--- a/scripts/ingest-data.ts
+++ b/scripts/ingest-data.ts
@@ -8,9 +8,47 @@ import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone';
 /* Name of directory to retrieve files from. You can change this as required */
 const directoryPath = 'Notion_DB';
 
+type FromDocumentsArgs = Parameters<typeof PineconeStore.fromDocuments>;
+
+/**
+ * Upsert `docs` into Pinecone in sequential batches of at most `chunkSize`
+ * documents, so that no single request carries the whole data set.
+ *
+ * @throws if `chunkSize` is not a positive integer, or if any batch fails
+ */
+async function ingestData(
+  index: FromDocumentsArgs[0],
+  docs: FromDocumentsArgs[1],
+  embeddings: FromDocumentsArgs[2],
+  chunkSize: number,
+): Promise<void> {
+  if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
+    // A step of zero or less never moves `i` past the end of `docs`.
+    throw new Error('chunkSize must be a positive integer');
+  }
+
+  for (let i = 0; i < docs.length; i += chunkSize) {
+    const chunk = docs.slice(i, i + chunkSize);
+    try {
+      await PineconeStore.fromDocuments(
+        index,
+        chunk,
+        embeddings,
+        'text',
+        PINECONE_NAME_SPACE, // optional namespace for your vectors
+      );
+      console.log(`Successfully ingested chunk ${i / chunkSize + 1}`);
+    } catch (error) {
+      console.error(`Error ingesting chunk ${i / chunkSize + 1}:`, error);
+      // Handle the error as needed
+      throw new Error('Failed to ingest your data');
+    }
+  }
+}
+
 export const run = async () => {
   try {
-    /*load raw docs from the markdown files in the directory */
+    /* Load raw docs from the markdown files in the directory */
     const rawDocs = await processMarkDownFiles(directoryPath);
 
     /* Split text into chunks */
@@ -23,16 +61,12 @@ export const run = async () => {
     console.log('split docs', docs);
 
     console.log('creating vector store...');
-    /*create and store the embeddings in the vectorStore*/
+    /* Create and store the embeddings in the vectorStore */
     const embeddings = new OpenAIEmbeddings();
-    const index = pinecone.Index(PINECONE_INDEX_NAME); //change to your own index name
-    await PineconeStore.fromDocuments(
-      index,
-      docs,
-      embeddings,
-      'text',
-      PINECONE_NAME_SPACE, //optional namespace for your vectors
-    );
+    const index = pinecone.Index(PINECONE_INDEX_NAME); // change to your own index name
+
+    await ingestData(index, docs, embeddings, 1000);
+
   } catch (error) {
     console.log('error', error);
     throw new Error('Failed to ingest your data');