app.py
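"""Streamlit app for querying the contents of an uploaded PDF or HTML file.

Run with:  streamlit run app.py
Assumes an OPENAI_API_KEY is available in the environment (e.g. via the
.env file loaded below), since the OpenAI embeddings and chat model read it.
"""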
import streamlit as st
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain_community.chat_models import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Function to read PDF content
def read_pdf(file):
    pdf_reader = PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for pages without a text layer
        text += page.extract_text() or ""
    return text

# Function to read HTML content
def read_html(file):
    html_content = file.getvalue().decode("utf-8")
    soup = BeautifulSoup(html_content, "html.parser")
    # Extract the visible text from the HTML markup
    text = soup.get_text(separator=" ")
    return text

# Load environment variables
load_dotenv()
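# (The .env file is expected to define OPENAI_API_KEY, which OpenAIEmbeddings
# and ChatOpenAI read from the environment.)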

# Main Streamlit app
def main():
    st.title("Query your PDF or HTML")

    with st.sidebar:
        st.title('Ai Scholar')
        st.markdown('''
## About
Choose the desired PDF or HTML file, then perform a query.
''')

    # File uploader for uploading PDFs or HTML files
    uploaded_file = st.file_uploader("Upload PDF or HTML", type=["pdf", "html"])

    if uploaded_file is None:
        st.info("Please upload a file of type: " + ", ".join(["pdf", "html"]) + " to start analysing your data.")
        st.image("waitingForScholar.webp", use_column_width=True)
        return
    # The early return above guarantees a file is present here
    if uploaded_file.type == "application/pdf":
        text = read_pdf(uploaded_file)
    elif uploaded_file.type == "text/html":
        text = read_html(uploaded_file)
    else:
        st.error("Unsupported file format. Please upload a PDF or HTML file.")
        return

    st.info("The content of the file is hidden. Type your query in the chat window.")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len
    )

    # Process the text and create the documents list
    documents = text_splitter.split_text(text=text)

    # Vectorize the documents and create the vectorstore
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_texts(documents, embedding=embeddings)

    st.session_state.processed_data = {
        "document_chunks": documents,
        "vectorstore": vectorstore,
    }
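    # Note: this block runs on every Streamlit rerun, so the embeddings and
    # FAISS index are rebuilt for each interaction; caching (e.g. with
    # st.cache_resource) or reusing st.session_state.processed_data would
    # avoid repeated embedding calls.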
    # Load the LangChain chatbot
    llm = ChatOpenAI(temperature=0, max_tokens=1000, model_name="gpt-3.5-turbo")
    qa = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever())

    # Initialize the Streamlit chat UI
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the conversation so far
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input(f"Ask a question about {uploaded_file.name}"):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # ConversationalRetrievalChain expects chat_history as a list of
        # (user_message, assistant_message) pairs, not (role, content) tuples.
        # Pair up the earlier messages; the current prompt was just appended.
        past = st.session_state.messages[:-1]
        chat_history = [
            (past[i]["content"], past[i + 1]["content"])
            for i in range(0, len(past) - 1, 2)
        ]
        result = qa({"question": prompt, "chat_history": chat_history})

        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = result["answer"]
            # Briefly show a typing cursor, then the final answer
            message_placeholder.markdown(full_response + "|")
            message_placeholder.markdown(full_response)
        st.session_state.messages.append({"role": "assistant", "content": full_response})

if __name__ == "__main__":
    main()