ingest_knowledge.py
"""
Knowledge Ingestion of the markdown files in the docs folder of the llama_index repository:
- https://github.com/jerryjliu/llama_index
Built with LlamaIndex, GithubRepositoryReader, and OpenAI.
Author:
@dcarpintero : https://github.com/dcarpintero
"""

import logging
import os

import openai
from dotenv import load_dotenv
from llama_index import download_loader, VectorStoreIndex
from llama_index.node_parser import SimpleNodeParser
from llama_hub.github_repo import GithubRepositoryReader, GithubClient


def load_environment_vars() -> dict:
    """Load required environment variables. Raise an exception if any are missing."""
    load_dotenv()

    api_key = os.getenv("OPENAI_API_KEY")
    github_token = os.getenv("GITHUB_TOKEN")

    if not api_key:
        raise EnvironmentError("OPENAI_API_KEY environment variable not set.")
    if not github_token:
        raise EnvironmentError("GITHUB_TOKEN environment variable not set.")

    logging.info("Environment variables loaded.")
    return {"OPENAI_API_KEY": api_key, "GITHUB_TOKEN": github_token}


def initialize_github_loader(github_token: str) -> GithubRepositoryReader:
    """Initialize a GithubRepositoryReader restricted to markdown files under docs/."""
    download_loader("GithubRepositoryReader")

    github_client = GithubClient(github_token)
    loader = GithubRepositoryReader(
        github_client,
        owner="jerryjliu",
        repo="llama_index",
        # Only .md files inside the docs directory are ingested.
        filter_directories=(["docs"], GithubRepositoryReader.FilterType.INCLUDE),
        filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
        verbose=False,
        concurrent_requests=10,
    )
    return loader


def load_and_index_data(loader: GithubRepositoryReader) -> VectorStoreIndex:
    """Load and index the knowledge base from the GitHub repository."""
    docs = load_data(loader)
    index = index_data(docs)
    return index


def load_data(loader: GithubRepositoryReader) -> list:
    """Load the knowledge base documents from the GitHub repository."""
    logging.info("Loading data from GitHub: %s/%s", loader._owner, loader._repo)
    docs = loader.load_data(branch="main")

    for doc in docs:
        logging.info(doc.extra_info)
        doc.metadata = {"filename": doc.extra_info["file_name"], "author": "LlamaIndex"}

    return docs


def index_data(docs: list) -> VectorStoreIndex:
    """Parse documents into nodes, index them, and persist the index to ./storage."""
    logging.info("Parsing documents into nodes...")
    parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=32)
    nodes = parser.get_nodes_from_documents(docs)

    logging.info("Indexing nodes...")
    index = VectorStoreIndex(nodes)

    logging.info("Persisting index to ./storage...")
    index.storage_context.persist(persist_dir="./storage")

    logging.info("Knowledge ingestion completed (OK)")
    return index
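

# Not part of this script, but as a minimal sketch of how the persisted index could be
# reloaded later by a separate query script (assuming the same llama_index version):
#
#   from llama_index import StorageContext, load_index_from_storage
#
#   storage_context = StorageContext.from_defaults(persist_dir="./storage")
#   index = load_index_from_storage(storage_context)
#   query_engine = index.as_query_engine()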


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

    try:
        env_vars = load_environment_vars()
        openai.api_key = env_vars["OPENAI_API_KEY"]

        loader = initialize_github_loader(env_vars["GITHUB_TOKEN"])
        load_and_index_data(loader)
    except Exception as ex:
        logging.error("Unexpected Error: %s", ex)
        # Re-raise without wrapping so the original traceback is preserved.
        raise