-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathhuxley.py
376 lines (312 loc) · 14.7 KB
/
huxley.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
import os
import time
import base64
import logging
import tempfile
import requests
import tiktoken
import pinecone
from io import BytesIO
import streamlit as st
from PyPDF2 import PdfReader
from dotenv import load_dotenv
from langchain.llms import OpenAI # type: ignore
from pdf2image import convert_from_bytes
from langchain.vectorstores import FAISS
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, PyMuPDFLoader, OnlinePDFLoader
# NOTE(review): leftover debugging statements that execute on every import /
# Streamlit rerun, in the middle of the import section. `loader` is re-bound
# locally inside the functions that load PDFs, and `pages` is not read anywhere
# in the visible file -- confirm nothing else depends on them before removing.
loader = PyPDFLoader('docs/white_paper.pdf')
pages = []
print(loader)
from templates.qa_prompt import QA_PROMPT
from templates.condense_prompt import CONDENSE_PROMPT
# Load settings from a local .env file (python-dotenv) before anything reads
# os.environ / os.getenv.
load_dotenv()
# Root logger configured at DEBUG level.
logging.basicConfig(level=logging.DEBUG)
# Page title, icon and wide layout for the Streamlit page.
# NOTE(review): the call's result is bound to `config` but never read in the
# visible file.
config = st.set_page_config(page_title='HuxleyPDF | by Fred Siika', page_icon='🗂', layout='wide')
# index = 'huxleypdf'
# openai_api_key=os.environ['OPENAI_API_KEY']
def check_openai_api_key():
    """Ask for the OpenAI API key and export it as ``OPENAI_API_KEY``.

    Shows an info banner and a password-type text input. When the input is
    empty ``st.stop()`` is called; otherwise the key is written to
    ``os.environ['OPENAI_API_KEY']`` and a confirmation containing only the
    first and last five characters of the key is shown.

    Returns:
        True when a key was entered. False is returned only if ``st.stop()``
        hands control back to this function.
    """
    st.info("Please add your OpenAI API key to begin.")
    entered_key = st.text_input("OpenAI API Key", type="password")

    if entered_key:
        os.environ['OPENAI_API_KEY'] = entered_key
        st.success(f"API key set: {entered_key[:5]}...{entered_key[-5:]}")
        return True

    st.stop()
    return False
def check_pinecone_api_key():
    """Ask for the Pinecone API key and export it as ``PINECONE_API_KEY``.

    Shows an info banner and a password-type text input. When the input is
    empty ``st.stop()`` is called; otherwise the key is written to
    ``os.environ['PINECONE_API_KEY']`` and a confirmation containing only the
    first and last five characters of the key is shown.

    Returns:
        True when a key was entered. False is returned only if ``st.stop()``
        hands control back to this function.
    """
    st.info("Please add your Pinecone API key to continue.")
    entered_key = st.text_input("Pinecone API Key", type="password")

    if entered_key:
        os.environ['PINECONE_API_KEY'] = entered_key
        st.success(f"API key set: {entered_key[:5]}...{entered_key[-5:]}")
        return True

    st.stop()
    return False
def check_pinecone_index():
    """Ask for the Pinecone index name and export it as ``PINECONE_INDEX``.

    Shows an info banner and a plain text input. When the input is empty
    ``st.stop()`` is called; otherwise the name is written to
    ``os.environ['PINECONE_INDEX']`` and echoed back in a success message.

    Returns:
        True when an index name was entered. False is returned only if
        ``st.stop()`` hands control back to this function.
    """
    st.info("Please add your Pinecone index to continue to begin. If you don't have one use the demo `huxleypdf`")
    entered_index = st.text_input("Pinecone Index")

    if entered_index:
        os.environ['PINECONE_INDEX'] = entered_index
        st.success(f"Index set: {entered_index}")
        return True

    st.stop()
    return False
def check_pinecone_namespace():
    """Ask for the Pinecone namespace and export it as ``PINECONE_NAMESPACE``.

    Shows an info banner and a plain text input. When the input is empty
    ``st.stop()`` is called; otherwise the namespace is written to
    ``os.environ['PINECONE_NAMESPACE']`` and echoed back in a success message.

    Returns:
        True when a namespace was entered. False is returned only if
        ``st.stop()`` hands control back to this function.
    """
    st.info("Please add your Pinecone namespace to continue. If you don't have one use the demo `ns1`")
    entered_namespace = st.text_input("Pinecone Namespace")

    if entered_namespace:
        os.environ['PINECONE_NAMESPACE'] = entered_namespace
        st.success(f"Namespace set: {entered_namespace}")
        return True

    st.stop()
    return False
def render_header():
    """Render the page title and the two-column introduction banner.

    The left column holds the markdown introduction (what the app does, the
    libraries it is built on, author and source links); the right column shows
    the banner image ``huxleychat_banner.png``.

    Returns:
        None.
    """
    st.title('🗂 HuxleyPDF')
    intro_col, banner_col = st.columns(2)

    with intro_col:
        st.markdown((
            "### LLM Assisted Custom Knowledgebase "
            "\n\n"
            "HuxleyPDF is a Python application that allows you to upload a PDF and ask questions about it using natural language."
            "\n\n"
            "#### How it works "
            "\n\n"
            "Upload personal docs and Chat with your PDF files with this GPT4-powered app. "
            "\n\n"
            # The original sentence glued an OpenAI link directly onto the
            # LangChain link (no separator) and linked OpenAI twice.
            "This tool is powered by "
            "[LangChain](<https://langchain.com/>) and [OpenAI](<https://openai.com>), and made by "
            "[@fredsiika](<https://twitter.com/fredsiika>)."
            "\n\n"
            "View Source Code on [Github](<https://github.com/fredsiika/huxley-pdf/blob/main/huxley.py>)"
        ))

    with banner_col:
        st.image(image='huxleychat_banner.png', width=300, caption='Tutorial and accompanying documentation coming soon.')
# Environment bootstrap hook (currently only announces itself on stdout).
def setup_environment():
    """Print a start-up message; the Pinecone connection step is disabled."""
    print('Setting up environment')
    # connect_to_pinecone(index)
def connect_to_pinecone(index_name):
    """Connect to Pinecone, seed the index with the white paper, return its stats.

    Steps performed:
      1. Initialise the Pinecone client from ``PINECONE_API_KEY`` and
         ``PINECONE_ENVIRONMENT``.
      2. Create ``index_name`` (cosine metric) if it is not listed yet.
      3. Load ``./docs/white_paper.pdf``, split it into 400-character chunks
         (20 overlap), embed the chunks and upsert them into the index.
      4. Run one sample similarity search and print the top hit, if any.

    Args:
        index_name: Name of the Pinecone index to create and/or connect to.

    Returns:
        The result of ``index.describe_index_stats()`` for the connected index.
    """
    # Embedding model used for both index sizing and document embedding.
    # text-embedding-ada-002 produces 1536-dimensional vectors. The previous
    # code asked a ChatOpenAI instance for `get_sentence_embedding_dimension()`,
    # which is a SentenceTransformer method and does not exist on chat models.
    embedding_model = 'text-embedding-ada-002'
    embedding_dimension = 1536

    # Both values are shown in the console at app.pinecone.io.
    pinecone.init(
        api_key=os.getenv('PINECONE_API_KEY'),
        environment=os.getenv('PINECONE_ENVIRONMENT'),
    )

    # Only create the index if it doesn't exist.
    if index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=index_name,
            dimension=embedding_dimension,
            metric='cosine',
        )

    print(f"Connecting to Pinecone..\nindex_name: {index_name}")
    index = pinecone.GRPCIndex(index_name)
    # Wait a moment for the index to be fully initialized.
    time.sleep(1)

    loader = PyMuPDFLoader("./docs/white_paper.pdf")
    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20)
    docs = text_splitter.split_documents(documents)

    embeddings = OpenAIEmbeddings(model=embedding_model)
    docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)

    # Smoke-test query; an empty result must not crash the connection step.
    query = "Why did the chicken cross the road?"
    matches = docsearch.similarity_search(query)
    if matches:
        print(f'\n{matches[0].page_content}\n')

    return index.describe_index_stats()
def clear_submit():
    """Reset the ``"submit"`` flag in Streamlit's session state to False."""
    st.session_state["submit"] = False
def sidebar():
    """Render the sidebar: about/how-to text followed by the credential inputs.

    After the static markdown, the four ``check_*`` helpers are invoked in
    order (OpenAI key, Pinecone key, Pinecone index, Pinecone namespace) and
    each result is written next to its label. Each helper calls ``st.stop()``
    while its input is empty, so later inputs only appear once the earlier
    ones have been filled in.

    Returns:
        None.
    """
    with st.sidebar:
        st.markdown('''## About HuxleyPDF''')
        # The multi-line markdown below is kept flush-left on purpose: leading
        # spaces inside the literal would become part of the rendered markdown.
        st.markdown('''
HuxleyPDF is a Python application that allows you to upload a PDF and ask questions about it using natural language.
## How it works:
Upload personal docs and Chat with your PDF files with this GPT4-powered app.
Built with [LangChain](https://docs.langchain.com/docs/), [Pinecone Vector Db](https://pinecone.io/), deployed on [Streamlit](https://streamlit.io)
## How to use:
1. Upload a PDF
2. Ask a question about the PDF
3. Get an answer about the PDF
4. Repeat
## Before you start using HuxleyPDF:
- You need to have an OpenAI API key. You can get one [here](https://api.openai.com/).
- You need to have a Pinecone API key. You can get one [here](https://www.pinecone.io/).
- You need to have a Pinecone environment. You can create one [here](https://www.pinecone.io/).
## How to obtain your OpenAI API key:
1. Sign in to your OpenAI account. If you do not have an account, [click here](https://platform.openai.com/signup) to sign up.
2. Visit the [OpenAI API keys page.](https://platform.openai.com/account/api-keys)
open-key-create
![Step 1 and 2 Create an API Key Screenshot](https://www.usechatgpt.ai/assets/chrome-extension/open-key-create.png)
3. Create a new secret key and copy & paste it into the "API key" input field below.👇🏾
''')
        st.markdown('''
## OpenAI API key
**Tips:**
- The official OpenAI API is more stable than the ChatGPT free plan. However, charges based on usage do apply.
- Your API Key is saved locally on your browser and not transmitted anywhere else.
- If you provide an API key enabled with GPT-4, the extension will support GPT-4.
- Your free OpenAI API key could expire at some point, therefore please check [the expiration status of your API key here.](https://platform.openai.com/account/usage)
- Access to ChatGPT may be unstable when demand is high for free OpenAI API key.
''')
        add_vertical_space(5)
        st.write('[HuxleyPDF](https://github.com/fredsiika/huxley-pdf) was made with ❤️ by [Fred](https://github.com/fredsiika)')

        # Each helper renders its own input widget and returns True once set.
        st.write("openai_api_key set: ", check_openai_api_key())
        st.write("pinecone_api set: ", check_pinecone_api_key())
        # Label previously read "pinecone_index set set:" (duplicated word).
        st.write("pinecone_index set: ", check_pinecone_index())
        st.write('pinecone_namespace set: ', check_pinecone_namespace())
def upload_files():
    """Render the multi-file PDF uploader and return the selected files.

    The uploader resets the ``"submit"`` session flag (via ``clear_submit``)
    whenever the selection changes.

    Returns:
        Whatever ``st.file_uploader`` returns. With
        ``accept_multiple_files=True`` that is a list of uploaded files, empty
        when nothing has been uploaded yet.
    """
    uploaded_files = st.file_uploader(
        "Upload multiple files",
        type="pdf",
        help="docs, and txt files are still in beta.",
        accept_multiple_files=True,
        on_change=clear_submit,
    )
    # In multi-file mode an empty selection is an empty list, not None, so a
    # truthiness test is required for the hint to ever be displayed.
    if not uploaded_files:
        st.info("Please upload a file of type: " + ", ".join(["pdf"]))
    return uploaded_files
# Encoder that tiktoken associates with the 'gpt-3.5-turbo' model, see
# https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L74
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')


def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module tokenizer.

    The text is encoded with ``disallowed_special=()``, i.e. with an empty set
    of disallowed special tokens.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
# Function to ingest the files
def ingest_files(uploaded_files):
    """Split the uploaded PDFs into chunks and upsert them into Pinecone.

    Each upload is written into a temporary directory, loaded with
    ``PyMuPDFLoader``, split into chunks of at most 2000 tokens (100 overlap,
    measured with ``tiktoken_len``), embedded with ``text-embedding-ada-002``
    and stored in the ``huxleypdf`` index under namespace ``ns1``.

    Any exception is reported to the user with ``st.error`` instead of being
    propagated.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``.name``
            and ``.read()`` (as returned by ``st.file_uploader``).

    Returns:
        None.
    """
    # Both values are shown in the console at app.pinecone.io.
    pinecone_api_key = os.getenv('PINECONE_API_KEY')
    pinecone_environment = os.getenv('PINECONE_ENVIRONMENT')

    try:
        with st.spinner("Indexing documents... this might take a while⏳"):
            with tempfile.TemporaryDirectory() as tmpdir:
                for uploaded_file in uploaded_files:
                    # The name comes from the client; keep only its last path
                    # component so the write cannot land outside tmpdir.
                    file_name = os.path.basename(uploaded_file.name)
                    st.write("Filename: ", file_name)
                    with open(os.path.join(tmpdir, file_name), "wb") as file:
                        file.write(uploaded_file.read())

                # Must run while tmpdir still exists.
                loader = DirectoryLoader(tmpdir, glob="**/*.pdf", loader_cls=PyMuPDFLoader)  # type: ignore
                documents = loader.load()

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=2000,
                chunk_overlap=100,
                length_function=tiktoken_len,
            )
            documents = text_splitter.split_documents(documents)

            pinecone.init(
                api_key=pinecone_api_key,
                environment=pinecone_environment,
            )
            embeddings = OpenAIEmbeddings(
                model='text-embedding-ada-002',
                openai_api_key=os.getenv('OPENAI_API_KEY'),
                client=None,
            )
            # `from_existing_index` only opens a handle and stores nothing, so
            # the chunks were never ingested. `from_documents` embeds and
            # upserts them into the same index/namespace that is queried later.
            Pinecone.from_documents(documents, embeddings, index_name='huxleypdf', namespace='ns1')

            st.success("Ingested File!")
            st.session_state["api_key_configured"] = True
    except Exception as e:
        # UI boundary: surface the failure to the user rather than crashing.
        st.error(f"Error while ingesting the files: {str(e)}")
    return None
# Function to display PDF as image on mobile devices
def show_pdf_as_image(pdf_bytes):
    """Convert the PDF bytes to images and display them one after another."""
    for page_image in convert_from_bytes(pdf_bytes):
        st.image(page_image)
# Function to display PDF as iFrame on desktop
def show_pdf_as_iframe(file):
    """Embed the given PDF file object in the page as a base64 data-URI iframe.

    Does nothing when `file` is None.
    """
    if file is None:
        return

    raw_pdf = file.read()
    encoded_pdf = base64.b64encode(raw_pdf).decode('utf-8')
    iframe_html = (
        f'<iframe src="data:application/pdf;base64,{encoded_pdf}" '
        'width="500" height="500" type="application/pdf"></iframe>'
    )
    st.markdown(iframe_html, unsafe_allow_html=True)
    # NOTE(review): the reader is constructed but never used afterwards.
    pdf_reader = PdfReader(file)
def main():
    """Streamlit entry point: header, sidebar, PDF upload and question answering.

    Once a PDF is uploaded its text is extracted and split into chunks, the
    existing ``huxleypdf`` Pinecone index (namespace ``ns1``) is opened, and a
    question typed by the user is answered with a "stuff" QA chain over the
    documents returned by a similarity search.
    """
    render_header()
    sidebar()
    # setup_environment()

    pdf = st.file_uploader("Upload your PDF", type="pdf")
    # Fetching remote PDFs using Unstructured
    # loader = OnlinePDFLoader("https://arxiv.org/pdf/2302.03803.pdf")
    # data = loader.load()
    # print(data)

    # Nothing to do until a file has been uploaded.
    if pdf is None:
        return

    # Extract the text of every page.
    pdf_reader = PdfReader(pdf)
    text = "".join(page.extract_text() for page in pdf_reader.pages)

    # Split into chunks; chunk_overlap is kept at 20% of chunk_size.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=400,
        chunk_overlap=80,
        length_function=len,
    )
    # NOTE(review): `chunks` is computed but never indexed -- answers come from
    # whatever the existing Pinecone index already holds, not from this upload.
    chunks = text_splitter.split_text(text)

    embeddings = OpenAIEmbeddings()

    #TODO: render image of pdf
    # show_pdf_as_iframe(pdf)
    knowledge_base = Pinecone.from_existing_index(index_name='huxleypdf', embedding=embeddings, namespace='ns1')

    user_question = st.text_input("Ask a question about your PDF: ")
    if not user_question:
        return

    docs = knowledge_base.similarity_search(user_question)
    chain = load_qa_chain(OpenAI(), chain_type="stuff")
    # The callback object is printed to stdout after the chain has run.
    with get_openai_callback() as cb:
        response = chain.run(input_documents=docs, question=user_question)
        print(cb)
    st.write(response)
    #TODO: Add error handling
# Run the app when this file is executed as a script.
if __name__ == '__main__':
    main()