# tools.py
from Config import *  # Config is assumed to re-export the shared dependencies used below
                      # (re, os, time, concurrent, nx, np, ollama, Rake, tqdm, stopwords,
                      # sent_tokenize, word_tokenize, Optional, List, process_pdf,
                      # extract_toc, save_gml, read_gml).

# This function reads files.
def read_file(file_path):
    """Read the content of a PDF, Markdown, text, or Python file."""
    if file_path.endswith('.pdf'):
        return process_pdf(file_path, save=False)
    elif file_path.endswith(('.md', '.markdown', '.txt', '.py')):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    else:
        raise ValueError("Unsupported file type. Please provide a PDF, Markdown, text, or Python file.")
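
# Example usage (illustrative sketch; the file paths below are placeholders):
#
#   text = read_file("notes/Biochem.md")   # returns the raw Markdown string
#   pdf_text = read_file("books/rb.pdf")   # extracted via process_pdf()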
# This function was intended for full chapter detection, but that proved too
# complex; it currently just extracts and returns the TOC chapter titles.
def detect_table_of_contents(file):
    """Detect the Table of Contents in the document and return its chapter titles."""

    # Simple container for a single TOC entry.
    class TOCEntry:
        def __init__(self, level, title, page):
            self.level = level
            self.title = title
            self.page = page

    # Extract chapter titles from a PDF file.
    def get_chapter_titles_from_pdf(filename):
        # Extract TOC entries using the provided filename.
        toc_entries = extract_toc(filename)
        # Pull out just the titles with a list comprehension.
        return [entry.title for entry in toc_entries]

    titles_list = get_chapter_titles_from_pdf(file)
    print(f"\n\ntitles list: {titles_list}\n\n")
    return titles_list
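
# Example usage (illustrative; assumes the PDF has an embedded TOC that
# extract_toc() can read):
#
#   titles = detect_table_of_contents("books/Biochem.pdf")
#   # -> ['Chapter 1: Amino Acids', 'Chapter 2: Protein Structure', ...]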
def split_text_into_sections(file):
    """Split text into chapters, pages, paragraphs, sentences, and words,
    and connect them all in a graph."""
    text = read_file(file)

    def split_text(text, delimiters):
        """Split text on any of the given delimiters."""
        # Build a regex pattern that matches any delimiter literally.
        pattern = '|'.join(map(re.escape, delimiters))
        return re.split(pattern, text)

    chapternames = detect_table_of_contents(file)  # Use the TOC titles as chapter delimiters
    chapters = split_text(text, chapternames)  # Deactivate if not using Biochem.md or rb.md
    # chapters = text.split('----')  # Activate if not using Biochem.md
    graph = nx.Graph()
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    def process_chapter(chapter):
        """Process a single chapter into pages, paragraphs, sentences, and words."""
        start = time.time()
        # Pages are assumed to be separated by '---'/'----' rules or blank lines.
        pattern = r'(?:-{3,4}|\n\n)'
        pages = re.split(pattern, chapter)  # re.split, not str.split, so the pattern is treated as a regex
        for page in pages:
            paragraphs = re.split(r'\n+', page)  # Split into paragraphs
            for paragraph in paragraphs:
                sentences = sent_tokenize(paragraph)  # Split into sentences using NLTK
                for sentence in sentences:
                    words = word_tokenize(sentence)  # Split into words using NLTK
                    filtered_words = [word for word in words if word.lower() not in stop_words]  # Remove stopwords
                    # Create nodes in the graph
                    graph.add_node(sentence)
                    sentence_embedding = get_embedding(sentence)
                    graph.nodes[sentence]['embedding'] = sentence_embedding  # Store embedding in the graph
                    for word in filtered_words:
                        graph.add_node(word)
                        graph.add_edge(sentence, word)  # Connect sentence to its words
                    # Extract keywords using RAKE
                    r = Rake()
                    r.extract_keywords_from_text(sentence)
                    keywords = r.get_ranked_phrases()
                    graph.nodes[sentence]['keywords'] = keywords  # Store keywords in the graph
                    for keyword in keywords:
                        graph.add_node(keyword)
                        keyword_embedding = get_embedding(keyword)
                        graph.nodes[keyword]['embedding'] = keyword_embedding  # Store embedding in the graph
                        graph.add_edge(sentence, keyword)  # Connect sentence to its keywords
                graph.add_edge(page, paragraph)  # Connect page to its paragraphs
            graph.add_edge(chapter, page)  # Connect chapter to its pages
        end = time.time()
        looptime = end - start
        print(f"-------------\n\nchapter completed in {looptime:.2f}s\n\n-------------")
    # Use multithreading to process chapters. Note: concurrent writes to a shared
    # networkx graph are not thread-safe in general; this mirrors the original
    # design, where most time is spent waiting on embedding API calls.
    cpu_count = os.cpu_count() or 1
    max_threads = cpu_count * 30  # Oversubscribe threads: the work is I/O-bound
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = [executor.submit(process_chapter, chapter) for chapter in chapters]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()  # Wait for the chapter processing to complete
            except Exception as e:
                print(f"Error processing chapter: {e}")
    return graph
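
# Example usage (illustrative; assumes "books/Biochem.pdf" exists and its TOC
# can be extracted):
#
#   g = split_text_into_sections("books/Biochem.pdf")
#   print(g.number_of_nodes(), g.number_of_edges())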
def process_pdfs_in_folder(folder_path, save):
    """Process every PDF in a folder into a graph and save each one as .gml."""
    saves = []
    # Progress bar for processing PDFs
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]  # Filter out only PDFs
    for filename in tqdm(pdf_files, desc="Processing PDFs"):
        full_path = os.path.join(folder_path, filename)  # Get full file path
        graph = split_text_into_sections(full_path)  # Process the PDF
        print(f"\n\n-------------\n\nGraph creation complete for {filename}\n\n--------------\n\n")
        # Remove the '.pdf' extension and save as .gml
        filename_no_ext = os.path.splitext(filename)[0]  # Get the file name without the extension
        savepath = os.path.join(save, f"{filename_no_ext}.gml")  # Save as .gml
        save_graph(graph, savepath)
        saves.append(savepath)  # Store the exact savepath
    msg = f"All books processed, saved at {saves}"  # Build the message first; print() itself returns None
    print(msg)
    return msg
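
# Example usage (illustrative; folder names are placeholders and the output
# folder is assumed to exist):
#
#   process_pdfs_in_folder("books/", save="graphs/")
#   # -> graphs/Biochem.gml, graphs/rb.gml, ...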
# If there is a library of books:
# GraphRAG takes a long time to compute on big books, so we save the graphs as
# .gml files and reload them instead of recomputing.
def save_graph(graph, filepath):
    """Save the graph to the given file path as a GML file."""
    save_gml(graph, filepath)
    print(f"Graph saved to {filepath} as a GML file")

def load_graph(filepath):
    """Load a graph from the given GML file path."""
    graph = read_gml(filepath)
    print(f"Graph loaded from {filepath}")
    return graph
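
# Example round trip (illustrative path):
#
#   save_graph(g, "graphs/Biochem.gml")
#   g2 = load_graph("graphs/Biochem.gml")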
# The embedding function
def get_embedding(text, model="mxbai-embed-large"):
    """Get an embedding for the given text using the Ollama API."""
    response = ollama.embeddings(model=model, prompt=text)
    return response["embedding"]
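
# Example usage (illustrative; assumes a local Ollama server is running with
# the "mxbai-embed-large" model pulled):
#
#   vec = get_embedding("Enzymes lower activation energy.")
#   print(len(vec))  # dimensionality of the embedding vector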
# This function scores how similar a stored chunk is to the query.
def calculate_cosine_similarity(chunk, query_embedding, embedding):
    """Calculate cosine similarity between a chunk's embedding and the query's."""
    if np.linalg.norm(query_embedding) == 0 or np.linalg.norm(embedding) == 0:
        return (chunk, 0)  # Handle zero vectors
    cosine_sim = np.dot(query_embedding, embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(embedding))
    return (chunk, cosine_sim)
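
# Worked example (illustrative vectors):
#
#   calculate_cosine_similarity("chunk", [1, 0], [1, 0])  # -> ("chunk", 1.0)  identical direction
#   calculate_cosine_similarity("chunk", [1, 0], [0, 1])  # -> ("chunk", 0.0)  orthogonal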
# The retrieval portion of the GraphRAG pipeline
def find_most_relevant_chunks(query, graph, k=25):
    """Find the most relevant chunks based on the graph and cosine similarity to the query."""
    # Step 1: Extract keywords from the query using RAKE
    r = Rake()
    r.extract_keywords_from_text(query)
    keywords = r.get_ranked_phrases()
    # Step 2: Find relevant sentences in the graph based on keywords
    relevant_sentences = set()
    for keyword in keywords:
        for node in graph.nodes():
            if keyword.lower() in node.lower():  # Check if the keyword appears in the node
                relevant_sentences.add(node)  # Add the whole sentence
    # Step 3: Score the relevant sentences against the query embedding
    similarities = {}
    query_embedding = get_embedding(query)
    for sentence in relevant_sentences:
        if sentence in graph.nodes:
            embedding = graph.nodes[sentence].get('embedding')
            if embedding is not None:
                cosine_sim = calculate_cosine_similarity(sentence, query_embedding, embedding)
                similarities[sentence] = cosine_sim[1]  # Store only the similarity score
    # Sort sentences by similarity, highest first
    sorted_sentences = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return sorted_sentences[:k]  # Return the top k relevant sentences
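
# Example usage (illustrative; assumes `g` was built by split_text_into_sections):
#
#   top = find_most_relevant_chunks("What is glycolysis?", g, k=5)
#   for sentence, score in top:
#       print(f"{score:.3f}  {sentence[:80]}")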
def answer_query(query: str, graph, Ucontext: Optional[List[str]] = None, model='mistral-nemo:latest', k: int = 25):
    """Answer a query using the graph and embeddings, combining the retrieved
    graph context with any user-provided context."""
    # Find the most relevant chunks from the graph
    relevant_chunks = find_most_relevant_chunks(query, graph, k=k)
    # Join the relevant chunks from the graph into a single string
    graph_context = " ".join(chunk for chunk, _ in relevant_chunks)
    # If Ucontext is provided, prepend it to the graph context
    if Ucontext:
        user_context = " ".join(Ucontext)
        context = f"{user_context} {graph_context}"  # Combine user context and graph context
    else:
        context = graph_context
    # Generate the response using the combined context
    response = ollama.generate(model=model, prompt=f"Based on the context: {context}, answer the question: {query}")
    # Return the response if available, otherwise return a default message
    return response.get('response', "No answer generated.")
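
# End-to-end sketch (illustrative; the paths and the question are placeholders,
# and a local Ollama server with the named models is assumed):
if __name__ == "__main__":
    graph = split_text_into_sections("books/Biochem.pdf")  # build the graph once
    save_graph(graph, "graphs/Biochem.gml")                # cache it for reuse
    graph = load_graph("graphs/Biochem.gml")               # later sessions just reload
    answer = answer_query("What is the role of ATP in metabolism?", graph, k=10)
    print(answer)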