-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathingestion.py
41 lines (34 loc) · 1.51 KB
/
ingestion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from dotenv import load_dotenv
import os
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone
# Load variables from a local .env file into the process environment before
# any client below reads its credentials (PINECONE_API_KEY is read explicitly;
# the OpenAI clients presumably pick up their key from the environment too).
load_dotenv()

if __name__ == "__main__":
    # Script entry point: read the local documentation dump, chunk it, embed
    # it, and upsert the resulting vectors into an existing Pinecone index.
    print("Going to ingest pinecone documentation...")

    # Kept as a local import, as in the original script.
    from llama_index.readers.file import UnstructuredReader

    # Read everything under ./llamaindex-docs; .html files are parsed with
    # UnstructuredReader rather than the directory reader's default handler.
    dir_reader = SimpleDirectoryReader(
        input_dir="./llamaindex-docs",
        file_extractor={".html": UnstructuredReader()},
    )
    documents = dir_reader.load_data()

    # Explicit ingestion configuration.
    node_parser = SimpleNodeParser.from_defaults(chunk_size=500, chunk_overlap=20)
    llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
    embed_model = OpenAIEmbedding(
        model="text-embedding-3-small", embed_batch_size=100
    )

    # Register the *configured* instances globally. Previously fresh default
    # OpenAI() / OpenAIEmbedding() objects were assigned here, so the model
    # choices above were silently ignored.
    # NOTE(review): whatever code queries this index must embed queries with
    # the same embedding model, otherwise similarity search is meaningless;
    # confirm the query side (and any already-ingested vectors) match.
    Settings.llm = llm
    Settings.embed_model = embed_model

    index_name = "llamaindex-documentation-helper"
    # Raises KeyError if PINECONE_API_KEY is not set in the environment/.env.
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    pinecone_index = pc.Index(name=index_name)
    vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Pass the node parser explicitly so the 500/20 chunking is actually
    # applied; it was previously constructed but never used.
    index = VectorStoreIndex.from_documents(
        documents=documents,
        storage_context=storage_context,
        transformations=[node_parser],
        show_progress=True,
    )
    print("finished ingesting...")