-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
164 lines (130 loc) · 13.4 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import hashlib
import os

import numpy as np
import openai
import PyPDF2
from dotenv import load_dotenv
from pymongo import MongoClient
# --- Module-level configuration -------------------------------------------
# Pull settings from a local .env file into the process environment before
# any of them are read below.
load_dotenv()

openai.api_key = os.environ.get("OPENAI_API_KEY")

# The spelling `mongdb_username` (sic) is kept on purpose: it is a
# module-level name and other code may refer to it.
mongdb_username = os.environ.get("MONGODB_USERNAME")
mongodb_password = os.environ.get("MONGODB_PASSWORD")

# NOTE(review): the credentials are interpolated into the URI exactly as they
# appear in the environment. If either one is unset, the literal text "None"
# ends up in the URI. Reserved characters in credentials must be
# percent-escaped for a MongoDB URI -- confirm the .env values are already
# escaped.
mongodb_connection_string = (
    f"mongodb+srv://{mongdb_username}:{mongodb_password}"
    "@cluster0.ya56g.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
)

# Shared handles used by the functions below: database "pdf_rag",
# collection "papers".
client = MongoClient(mongodb_connection_string)
db = client["pdf_rag"]
collection = db["papers"]
# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined by single spaces.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        One string made of each page's extracted text, in page order,
        separated by a single space.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file is still open: the reader pulls page data
        # from the underlying file object.
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    return " ".join(page_texts)
# split text into chunks
def split_text(text, chunk_size=500, overlap=100):
    """Split *text* into fixed-size character chunks that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Splitting stops as soon as a chunk reaches the end of the text, so no
    trailing chunk is produced that is merely a suffix of the one before it.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunks in order of appearance; empty when *text* is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap  # strictly positive thanks to the checks above
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            # This chunk already covers the tail; advancing again would only
            # yield a fragment fully contained in it.
            break
        start += step
    return chunks
# generate embeddings for text chunks
# def generate_embeddings(text_chunks):
# embeddings = []
# for chunk in text_chunks:
# response = openai.embeddings.create(
# model="text-embedding-3-small",
# input=chunk
# )
# embeddings.append(response.data[0].embedding)
# return embeddings
# insert embeddings into MongoDB
# def insert_embeddings_into_mongodb(text_chunks, embeddings):
# for i, (chunk, embedding) in enumerate(zip(text_chunks, embeddings)):
# document = {
# "_id": f"chunk-{i}",
# "text": chunk,
# "embedding": embedding
# }
# collection.insert_one(document)
# generate embeddings and store them in MongoDB
def generate_and_store_embeddings(text_chunks):
    """Embed each text chunk and store it in MongoDB, skipping known chunks.

    For every chunk, the collection is first searched for a document with the
    same ``text``. Only chunks that are not stored yet are sent to the
    embeddings API and inserted as ``{"_id", "text", "embedding"}``.

    The ``_id`` is derived from the chunk's content (SHA-256 of its UTF-8
    bytes) rather than from its position in *text_chunks*. A positional id
    such as ``chunk-0`` collides as soon as a second document is processed,
    making the insert fail with a duplicate-key error.

    Args:
        text_chunks: Iterable of strings to embed and store.

    Returns:
        None.
    """
    for i, chunk in enumerate(text_chunks):
        # Check whether this exact text has already been embedded and stored.
        existing_doc = collection.find_one({"text": chunk})
        if existing_doc:
            print(f"chunk {i} already exists in the database. Skipping...")
            continue

        # Not stored yet: request an embedding for the chunk.
        print(f"Generating embedding for chunk {i}...")
        response = openai.embeddings.create(
            model="text-embedding-3-small",
            input=chunk
        )
        embedding = response.data[0].embedding

        # Content-derived id: unique per distinct text, stable across runs.
        content_digest = hashlib.sha256(chunk.encode("utf-8")).hexdigest()
        document = {
            "_id": f"chunk-{content_digest}",
            "text": chunk,
            "embedding": embedding
        }
        collection.insert_one(document)
# query MongoDB to retrieve relevant chunks
def query_mongodb(query, top_k=5):
    """Return the stored chunk texts most similar to *query*.

    The query is embedded with the same model used for the stored chunks.
    Every document in the collection is then scored in Python by cosine
    similarity against the query embedding.

    Args:
        query: The question or search text.
        top_k: Maximum number of chunk texts to return.

    Returns:
        A list of chunk texts ordered from most to least similar; empty when
        the collection holds no documents.
    """
    # Embed the query.
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=query
    )
    query_embedding = np.asarray(response.data[0].embedding, dtype=float)
    # The query norm is the same for every document, so compute it once.
    query_norm = np.linalg.norm(query_embedding)

    similarities = []
    # Fetch only the two fields that are used, and stream the cursor instead
    # of loading the whole collection into a list first.
    for doc in collection.find({}, {"text": 1, "embedding": 1}):
        embedding = np.asarray(doc["embedding"], dtype=float)
        denominator = query_norm * np.linalg.norm(embedding)
        if denominator:
            similarity = float(np.dot(query_embedding, embedding) / denominator)
        else:
            # A zero-length vector has no direction; score it 0 rather than
            # letting NaN corrupt the ordering below.
            similarity = 0.0
        similarities.append((doc["text"], similarity))

    # Highest similarity first, then keep the top_k texts.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    return [text for text, _ in similarities[:top_k]]
# use GPT-4 to generate an answer
def generate_answer(context, query):
    """Ask GPT-4 to answer *query* using *context* as supporting text.

    Args:
        context: Text placed before the question in the user message.
        query: The question to answer.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant knowledgable in biology, evolution and science."},
            {"role": "user", "content": f"\n{context}\n\nQuestion:{query}\nAnswer:"}
        ],
        temperature=0.2,
        max_tokens=300
    )
    # The API declares message.content as nullable; guard so a content-less
    # reply yields "" instead of raising AttributeError on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()
# full pipeline
def main(pdf_path, user_query):
    """Run the full pipeline: PDF -> chunks -> embeddings -> retrieval -> answer.

    Progress messages and the final answer are printed to standard output.

    Args:
        pdf_path: Path of the PDF to index.
        user_query: Question to answer from the PDF's content.

    Returns:
        The generated answer, so callers can use it as well as see it printed.
    """
    # Extract and chunk the PDF text.
    print("Extracting text from PDF...")
    pdf_text = extract_text_from_pdf(pdf_path)
    print("Splitting text into chunks...")
    text_chunks = split_text(pdf_text)

    # Embed the chunks and store them. The call is made for its side effect
    # only, so its result is not kept.
    print("Working on embeddings")
    generate_and_store_embeddings(text_chunks)

    # Retrieve the most relevant chunks and merge them into one context.
    print("Querying MongoDB...")
    relevant_chunks = query_mongodb(user_query)
    context = " ".join(relevant_chunks)

    print("Generating answer with GPT-4...")
    answer = generate_answer(context, user_query)
    print("\n Answer")
    print(answer)
    return answer
# Script entry point: index one fixed PDF and answer one fixed question.
if __name__ == "__main__":
    pdf_path, user_query = "paper1.pdf", "What is the Darwinian fitness?"
    main(pdf_path=pdf_path, user_query=user_query)
# --------DEBUG-------
# text = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. 
It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. 
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. 
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. 
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."
# chunks = split_text(text)
# print("chunks[0]", chunks[0])
# embeddings = generate_embeddings("I am abhishek sharma.")
# print("embeddings ", embeddings)