-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathlocal_genai_search_ollama.py
295 lines (248 loc) Β· 10.5 KB
/
local_genai_search_ollama.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
import os
import faiss
import numpy as np
import PyPDF2
import docx
from pptx import Presentation
import json
import streamlit as st
import re
import ollama
from streamlit_lottie import st_lottie
import requests
print("Starting the application...")
# --- Module-level search state shared by indexing and querying ---
# Global variables
dimension = 768 # Dimension for 'nomic-embed-text' model
# Inner-product (cosine-like) FAISS index over document-chunk embeddings.
index = faiss.IndexFlatIP(dimension)
# One record per indexed chunk: abs_path, rel_path, chunk_id, base_dir.
metadata = []
print(f"Initialized FAISS index with dimension {dimension}")
def read_pdf(file_path):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        file_path: Path to the .pdf file.

    Returns:
        A single space-joined string of all extracted page text.
    """
    print(f"Reading PDF: {file_path}")
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # extract_text() returns None for pages with no extractable text
        # (e.g. scanned images); coalesce to '' so join() cannot raise
        # TypeError and one bad page does not lose the whole document.
        return ' '.join([page.extract_text() or '' for page in reader.pages])
def read_docx(file_path):
    """Return the full text of a .docx file as one space-joined string."""
    print(f"Reading DOCX: {file_path}")
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return ' '.join(paragraph_texts)
def read_pptx(file_path):
    """Return all text carried by shapes across every slide of a .pptx file."""
    print(f"Reading PPTX: {file_path}")
    presentation = Presentation(file_path)
    texts = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            # Not every shape type carries text (pictures, charts, ...).
            if hasattr(shape, 'text'):
                texts.append(shape.text)
    return ' '.join(texts)
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping word-based chunks.

    Args:
        text: Raw document text to split.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of space-joined chunks; empty list for empty/whitespace text.

    Raises:
        ValueError: If overlap >= chunk_size — the stride would be <= 0,
            which previously surfaced only as an opaque range() error.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    print(f"Chunking text of length {len(text)} with chunk size {chunk_size} and overlap {overlap}")
    words = text.split()
    chunks = []
    # Step by (chunk_size - overlap) so each chunk repeats the trailing
    # `overlap` words of its predecessor, preserving context at boundaries.
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(' '.join(words[i:i + chunk_size]))
    print(f"Created {len(chunks)} chunks")
    return chunks
def embed_text(text):
    """Embed *text* with the local 'nomic-embed-text' Ollama model.

    Returns:
        The embedding vector (list of floats) from the Ollama response.
    """
    return ollama.embeddings(model='nomic-embed-text', prompt=text)['embedding']
def index_documents(directory):
    """Recursively index all supported documents under *directory*.

    Supported extensions (case-insensitive): .pdf, .docx, .pptx, .txt.
    Each document is chunked, embedded via Ollama, and added to the
    module-level FAISS `index`; the module-level `metadata` list records
    the origin of every chunk. Both are persisted to
    'document_index.faiss' and 'metadata.json' in the working directory.
    """
    print(f"Indexing documents in directory: {directory}")
    global metadata
    documents = []
    metadata = []  # Reset metadata so re-indexing starts from scratch
    # Convert to absolute path so we can also store portable relative paths.
    abs_directory = os.path.abspath(directory)
    for root, _, files in os.walk(abs_directory):
        for file in files:
            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path}")
            content = ""
            try:
                if file.lower().endswith('.pdf'):
                    content = read_pdf(file_path)
                elif file.lower().endswith('.docx'):
                    content = read_docx(file_path)
                elif file.lower().endswith('.pptx'):
                    content = read_pptx(file_path)
                elif file.lower().endswith('.txt'):
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
            except Exception as e:
                # One corrupt/unreadable file must not abort the whole run.
                print(f"Skipping {file_path}: {e}")
                continue
            if content:
                chunks = chunk_text(content)
                for i, chunk in enumerate(chunks):
                    documents.append(chunk)
                    # Store both absolute and relative paths so lookups can
                    # survive the documents folder being moved later.
                    rel_path = os.path.relpath(file_path, abs_directory)
                    metadata.append({
                        "abs_path": file_path,
                        "rel_path": rel_path,
                        "chunk_id": i,
                        "base_dir": abs_directory
                    })
    if not documents:
        # index.add() on an empty array would raise; nothing to persist.
        print("No supported documents found; nothing to index.")
        return
    print(f"Encoding {len(documents)} document chunks")
    embeddings = [embed_text(doc) for doc in documents]
    print(f"Adding embeddings to FAISS index")
    # FAISS requires float32 input; a bare np.array() of Python floats
    # would produce float64, which index.add() rejects.
    index.add(np.array(embeddings, dtype=np.float32))
    # Save index and metadata
    print("Saving FAISS index and metadata")
    faiss.write_index(index, "document_index.faiss")
    with open("metadata.json", "w") as f:
        json.dump(metadata, f)
    print(f"Indexed {len(documents)} document chunks.")
def read_document_chunk(file_path, chunk_id):
    """Re-read a source document and return the chunk with index *chunk_id*.

    Args:
        file_path: Path recorded for the chunk at indexing time.
        chunk_id: Zero-based chunk index within the re-chunked document.

    Returns:
        The chunk text, "" when chunk_id is out of range, or a
        "[Content not available ...]" placeholder when the file is missing.
    """
    print(f"Reading document chunk: {file_path}, chunk_id: {chunk_id}")
    content = ""
    # Find the metadata entry for this file so we can try the alternate
    # locations (absolute path vs. base_dir + relative) saved at index time.
    matching_meta = None
    for meta in metadata:
        if meta["abs_path"] == file_path or meta["rel_path"] == os.path.basename(file_path):
            matching_meta = meta
            break
    if matching_meta:
        # Try both the absolute path and the reconstructed relative path.
        try_paths = [
            matching_meta["abs_path"],
            os.path.join(matching_meta["base_dir"], matching_meta["rel_path"])
        ]
        for try_path in try_paths:
            if os.path.exists(try_path):
                file_path = try_path
                break
        else:
            print(f"File not found: {file_path}")
            return f"[Content not available for {os.path.basename(file_path)}]"
    elif not os.path.exists(file_path):
        # No metadata match and the given path is gone: avoid crashing on open().
        print(f"File not found: {file_path}")
        return f"[Content not available for {os.path.basename(file_path)}]"
    # Case-insensitive extension matching, consistent with index_documents()
    # (a .PDF that was indexed would otherwise silently yield "" here).
    lower_path = file_path.lower()
    if lower_path.endswith('.pdf'):
        content = read_pdf(file_path)
    elif lower_path.endswith('.docx'):
        content = read_docx(file_path)
    elif lower_path.endswith('.pptx'):
        content = read_pptx(file_path)
    elif lower_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    chunks = chunk_text(content)
    return chunks[chunk_id] if chunk_id < len(chunks) else ""
def semantic_search(query, k=10):
    """Return the top-*k* indexed chunks most similar to *query*.

    Args:
        query: Natural-language search string.
        k: Maximum number of results to return.

    Returns:
        A list of dicts with keys "id", "path", "content", "score",
        ordered by descending inner-product score. May contain fewer
        than k entries when the index holds fewer than k vectors.
    """
    print(f"Performing semantic search for query: '{query}', k={k}")
    query_vector = embed_text(query)
    # FAISS expects a float32 matrix of shape (n_queries, dim).
    distances, indices = index.search(np.array([query_vector], dtype=np.float32), k)
    results = []
    for i, idx in enumerate(indices[0]):
        if idx < 0:
            # FAISS pads with -1 when fewer than k vectors are indexed;
            # metadata[-1] would silently return the wrong entry.
            continue
        meta = metadata[idx]
        content = read_document_chunk(meta["abs_path"], meta["chunk_id"])
        results.append({
            "id": int(idx),
            "path": meta["abs_path"],
            "content": content,
            "score": float(distances[0][i])
        })
    print(f"Found {len(results)} search results")
    return results
def generate_answer(query, context):
    """Ask the local 'tinyllama' Ollama model to answer *query*.

    Args:
        query: The user's question.
        context: Numbered document chunks the model should cite as [n].

    Returns:
        The model's raw answer text.
    """
    print(f"Generating answer for query: '{query}'")
    prompt = f"""Answer the user's question using the documents given in the context. In the context are documents that should contain an answer. Please always reference the document ID (in square brackets, for example [0],[1]) of the document that was used to make a claim. Use as many citations and documents as it is necessary to answer the question.
Context:
{context}
Question: {query}
Answer:"""
    print("Sending prompt to Ollama")
    reply = ollama.generate(model='tinyllama', prompt=prompt)
    print("Received response from Ollama")
    return reply['response']
def load_lottieurl(url: str):
    """Fetch a Lottie animation JSON from *url*.

    Returns:
        The parsed JSON payload, or None when the request fails, times
        out, returns a non-200 status, or the body is not valid JSON.
    """
    try:
        # A timeout prevents the Streamlit page from hanging indefinitely
        # when the animation CDN is slow or unreachable; network errors
        # degrade to the same "no animation" result as a bad status.
        r = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if r.status_code != 200:
        return None
    try:
        return r.json()
    except ValueError:
        return None
def main():
    """Streamlit entry point: index a documents folder, then answer questions.

    Renders the page, optionally (re)builds the FAISS index, loads the
    persisted index/metadata into the module-level globals, and wires the
    search + answer-generation flow to the UI.
    """
    global documents_path, index, metadata
    print("Starting Streamlit UI")
    # Page config
    st.set_page_config(page_title="Local GenAI Search", page_icon="π", layout="wide")
    # Custom CSS
    st.markdown("""
    <style>
    .big-font {
        font-size:30px !important;
        font-weight: bold;
        color: #1E90FF;
    }
    .stButton>button {
        background-color: #4CAF50;
        color: white;
        font-size: 18px;
        padding: 10px 24px;
        border-radius: 12px;
        border: none;
    }
    .stTextInput>div>div>input {
        font-size: 16px;
    }
    </style>
    """, unsafe_allow_html=True)
    # Title and animation
    col1, col2 = st.columns([2, 1])
    with col1:
        st.markdown('<p class="big-font">Local GenAI Search π</p>', unsafe_allow_html=True)
        st.write("Explore your documents with the power of AI!")
    with col2:
        lottie_url = "https://assets5.lottiefiles.com/packages/lf20_fcfjwiyb.json"
        lottie_json = load_lottieurl(lottie_url)
        st_lottie(lottie_json, height=150, key="coding")
    # Input for documents path
    documents_path = st.text_input("π Enter the path to your documents folder:", "")
    if documents_path:
        documents_path = os.path.abspath(documents_path)
    # Check if documents are indexed
    if not os.path.exists("document_index.faiss"):
        st.warning("β οΈ Documents are not indexed. Please run the indexing process first.")
        if st.button("π Index Documents"):
            with st.spinner("Indexing documents... This may take a while."):
                print(f"Indexing documents in {documents_path}")
                index_documents(documents_path)
            # NOTE(review): original literal was corrupted by encoding damage
            # (split mid-character); restored as a checkmark message.
            st.success("✅ Indexing complete!")
            st.rerun()
    # Load index and metadata if not already loaded. Guard on the file's
    # existence: before the first indexing run faiss.read_index() would
    # otherwise crash the page.
    if os.path.exists("document_index.faiss") and len(metadata) == 0:
        print("Loading FAISS index and metadata")
        index = faiss.read_index("document_index.faiss")
        with open("metadata.json", "r") as f:
            metadata = json.load(f)
        print(f"Loaded index with {index.ntotal} vectors and {len(metadata)} metadata entries")
    st.markdown("---")
    st.markdown("## Ask a Question")
    question = st.text_input("π€ What would you like to know about your documents?", "")
    if st.button("π Search and Answer"):
        if question:
            with st.spinner("Searching and generating answer..."):
                print(f"User asked: '{question}'")
                # Perform semantic search
                search_results = semantic_search(question)
                # Number the chunks so the model can cite them as [0], [1], ...
                context = "\n\n".join([f"{i}: {result['content']}" for i, result in enumerate(search_results)])
                # Generate answer
                answer = generate_answer(question, context)
                st.markdown("### π€ AI Answer:")
                st.markdown(answer)
                # Display the documents the answer actually cited
                st.markdown("### π Referenced Documents:")
                referenced_ids = set()
                for match in re.finditer(r'\[(\d+)\]', answer):
                    try:
                        doc_id = int(match.group(1))
                        if doc_id < len(search_results):
                            referenced_ids.add(doc_id)
                    except ValueError:
                        continue
                print(f"Displaying {len(referenced_ids)} referenced documents")
                # sorted() gives a deterministic display order (sets don't).
                for doc_id in sorted(referenced_ids):
                    doc = search_results[doc_id]
                    with st.expander(f"π Document {doc_id} - {os.path.basename(doc['path'])}"):
                        st.write(doc['content'])
                        st.write(f"Source: {doc['path']}")
                        if os.path.exists(doc['path']):
                            with open(doc['path'], 'rb') as f:
                                st.download_button("β¬οΈ Download file", f, file_name=os.path.basename(doc['path']))
                        else:
                            st.warning(f"β οΈ File not found: {doc['path']}")
        else:
            st.warning("β οΈ Please enter a question before clicking 'Search and Answer'.")
# Script entry point; runs only under `streamlit run`, not on import.
if __name__ == "__main__":
    main()
    print("Application finished")