-
Notifications
You must be signed in to change notification settings - Fork 0
/
query_processing.py
381 lines (321 loc) · 14.5 KB
/
query_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
import os
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import customtkinter as ctk
import tkinter as tk
def extract_indexes():
    """
    Load the inverted index and the positional index from their text files.

    Both files are opened relative to the current working directory:

    * 'inverted_index.txt' - one ``term:docID docID ...`` entry per line.
    * 'positional_index.txt' - one ``term:docID.pos pos ...`` entry per line; a term that
      occurs in several documents appears on several lines.

    Blank lines are skipped. The term is everything before the LAST ':' on the line, so a
    term that itself contains ':' is still parsed correctly.

    Returns:
        inverted_index (dict): Maps each term to its list of integer document IDs.
        positional_index (dict): Maps each term to a dict of {docID: [positions]}.

    Raises:
        FileNotFoundError: If either index file is missing.
        ValueError: If a non-blank line lacks its ':' (or '.') separator, or a document
            ID / position is not an integer.
    """
    inverted_index = {}
    positional_index = {}

    with open('inverted_index.txt', 'r') as f:
        for line in f:
            if not line.strip():  # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            term, separator, postings = line.rpartition(':')
            if not separator:
                raise ValueError(f"malformed line in inverted_index.txt: {line!r}")
            # str.split() with no argument already discards empty strings and newlines
            inverted_index[term] = [int(doc_id) for doc_id in postings.split()]

    with open('positional_index.txt', 'r') as f:
        for line in f:
            if not line.strip():
                continue
            term, separator, remainder = line.rpartition(':')
            if not separator:
                raise ValueError(f"malformed line in positional_index.txt: {line!r}")
            doc_id, dot, positions = remainder.partition('.')
            if not dot:
                raise ValueError(f"malformed line in positional_index.txt: {line!r}")
            # create the per-term dictionary on first sight, then record this document's positions
            positional_index.setdefault(term, {})[int(doc_id)] = [int(p) for p in positions.split()]

    return inverted_index, positional_index
def get_docIDs():
    """
    Collect the document IDs encoded in the file names of the 'ResearchPapers' directory.

    The directory is looked up inside the current working directory. Each file name is
    expected to be an integer document ID followed by an extension (e.g. '12.txt'); the
    extension is removed and the remainder converted to int. Entries whose name is not
    an integer (stray files, sub-directories) are ignored.

    Returns:
        docID (list): The document IDs in ascending order.

    Raises:
        FileNotFoundError: If the 'ResearchPapers' directory does not exist.
    """
    # os.path.join keeps the path portable instead of hard-coding a Windows backslash
    papers_dir = os.path.join(os.getcwd(), 'ResearchPapers')
    docID = []
    for file_name in os.listdir(papers_dir):
        # splitext removes a real suffix; str.rstrip('.txt') strips a character set instead
        stem, _extension = os.path.splitext(file_name)
        try:
            docID.append(int(stem))
        except ValueError:
            continue  # not a numbered document file
    docID.sort()
    return docID
def get_stopwords():
    """
    Read the stopword list from 'Stopword-List.txt' in the current working directory.

    Every line of the file contributes one entry, with trailing spaces and the trailing
    newline removed. Lines that consist of a newline only are dropped.

    Returns:
        stopwords (list): The stopwords in file order.
    """
    with open('Stopword-List.txt', 'r') as f:
        raw_lines = f.readlines()  # whole file at once; each item keeps its line terminator
    # drop purely empty lines, then trim trailing spaces/newlines from the rest
    return [line.rstrip(' \n') for line in raw_lines if line != '\n']
def INTERSECTION(p1, p2):
    """
    Return the elements of p1 that also occur in p2.

    The order (and any duplicates) of p1 are preserved in the result.

    Args:
        p1 (list): The first list.
        p2 (list): The second list.
    Returns:
        result (list): A list containing the elements common to p1 and p2.
    """
    try:
        # a set gives O(1) membership tests instead of rescanning p2 for every element of p1
        members = set(p2)
    except TypeError:
        # unhashable elements cannot go into a set; fall back to scanning the list
        members = p2
    return [c for c in p1 if c in members]
def UNION(p1, p2):
    """
    Return the union of two lists without modifying either argument.

    Args:
        p1 (list): The first list.
        p2 (list): The second list.
    Returns:
        result (list): The distinct elements of p1 and p2 in ascending order.

    Note:
        Elements must be hashable and mutually orderable (document IDs are ints).
    """
    # build new sets instead of extending p1 in place, so the caller's list is left untouched;
    # sorting makes the output order deterministic rather than dependent on set iteration
    return sorted(set(p1) | set(p2))
def NOT(p1):
    """
    Return the complement of p1 with respect to the full document collection.

    The collection is obtained from get_docIDs(); the result keeps that function's order.

    Args:
        p1 (list): The document IDs to exclude.
    Returns:
        result (list): Every document ID returned by get_docIDs() that is not in p1.
    """
    complement = []
    for doc_id in get_docIDs():
        if doc_id in p1:  # listed in the input, so not part of the complement
            continue
        complement.append(doc_id)
    return complement
def _evaluate_bool_tokens(tokens, inverted_index):
    """Evaluate an already-stemmed boolean token list and return the matching doc IDs as a list."""
    if not tokens:  # empty operand, e.g. nothing before/after an operator
        return []
    # Operators are resolved in the order AND, OR, NOT, each splitting at its first occurrence.
    if 'AND' in tokens:
        split_at = tokens.index('AND')
        left = _evaluate_bool_tokens(tokens[:split_at], inverted_index)
        right = _evaluate_bool_tokens(tokens[split_at + 1:], inverted_index)
        return INTERSECTION(left, right)
    if 'OR' in tokens:
        split_at = tokens.index('OR')
        left = _evaluate_bool_tokens(tokens[:split_at], inverted_index)
        right = _evaluate_bool_tokens(tokens[split_at + 1:], inverted_index)
        return UNION(left, right)
    if 'NOT' in tokens:
        # only the part after NOT is negated; anything before it is ignored
        split_at = tokens.index('NOT')
        return NOT(_evaluate_bool_tokens(tokens[split_at + 1:], inverted_index))
    # Plain term: only the first token is looked up. Return a copy so that no set operation
    # applied to the result can ever alter the postings list stored in the index.
    return list(inverted_index.get(tokens[0], []))


def BoolQueryProcessing(query):
    """
    Process a boolean query and return the matching document IDs.

    The query is split on whitespace. The upper-case words 'AND', 'OR' and 'NOT' are
    operators; every other word is stemmed with the Porter stemmer and stripped of
    trailing apostrophes before being looked up in the inverted index. The query is
    split at the first 'AND' if present, otherwise at the first 'OR', otherwise the part
    after the first 'NOT' is negated. A query without operators is treated as a single
    term (its first word).

    Stemming and index loading happen once per call; the recursion works on the token
    list, so terms are never stemmed twice.

    Args:
        query (str): The query to be processed.
    Returns:
        result (str): The matching document IDs, each followed by a single space
            ('' when nothing matches or the query is empty).
    """
    porter_stemmer = PorterStemmer()  # initialize the stemmer
    inverted_index, _ = extract_indexes()  # the positional index is not needed for boolean queries

    tokens = []
    for word in query.split():
        if word in ('AND', 'OR', 'NOT'):  # boolean operators are kept verbatim
            tokens.append(word)
        else:
            # stem the term, then drop any trailing apostrophe left on the stem
            tokens.append(porter_stemmer.stem(word).rstrip("'"))

    result = _evaluate_bool_tokens(tokens, inverted_index)
    return ''.join(str(c) + ' ' for c in result)  # convert the result to a string
def _within_distance(first_positions, second_positions, distance):
    """Return True if some position in each list differs by at most `distance`."""
    # Two-pointer scan: always advance the pointer that sits on the smaller position.
    # NOTE(review): assumes both position lists are in ascending order - confirm against
    # the code that writes positional_index.txt.
    i = 0
    j = 0
    while i < len(first_positions) and j < len(second_positions):
        gap = first_positions[i] - second_positions[j]
        if abs(gap) <= distance:
            return True
        if gap > 0:
            j += 1  # second term is too far behind the first; move it forward
        else:
            i += 1  # first term is too far behind the second; move it forward
    return False


def ProxQueryProcessing(query):
    """
    Process a proximity query of the form ``term1 term2 /k``.

    The last whitespace-separated token carries the distance k (the integer after its
    final '/'); the first two tokens are the terms. Terms are lower-cased and, unless
    they are stopwords, stemmed with the Porter stemmer and stripped of trailing
    apostrophes. A document matches when it contains both terms at positions that
    differ by at most k.

    Parameters:
        query (str): The proximity query to be processed.
    Returns:
        str: The matching document IDs, each followed by a single space ('' when
            nothing matches, when a term is a stopword or missing from the index, or
            when the query has fewer than two terms).
    Raises:
        ValueError: If the distance token does not end in an integer.
    """
    tokens = query.split()  # split the query into words
    if len(tokens) < 3:  # two terms plus the distance token are required
        return ''

    # the full number after the last '/', so multi-digit distances such as '/12' work
    distance = int(tokens.pop().rpartition('/')[2])

    porter_stemmer = PorterStemmer()  # initialize the stemmer
    stopwords = set(get_stopwords())  # get the stopwords
    inverted_index, positional_index = extract_indexes()  # extract the inverted and positional indexes

    terms = []
    for raw_word in tokens[:2]:
        term = raw_word.lower()
        if term not in stopwords:  # stopwords are left unstemmed; they match no documents below
            term = porter_stemmer.stem(term).rstrip("'")
        terms.append(term)
    first_term, second_term = terms

    # a stopword, or a term absent from the index, contributes an empty postings list
    postings = [
        [] if term in stopwords else inverted_index.get(term, [])
        for term in terms
    ]
    common_docs = INTERSECTION(postings[0], postings[1])  # documents containing both terms

    result = []
    for doc_id in common_docs:
        pp1 = positional_index.get(first_term, {}).get(doc_id, [])  # positions of the first term
        pp2 = positional_index.get(second_term, {}).get(doc_id, [])  # positions of the second term
        if _within_distance(pp1, pp2, distance):
            result.append(doc_id)
    return ''.join(str(c) + ' ' for c in result)  # convert the result to a string
def process_query():
    """
    Read the query from the GUI entry field, process it, and show the result in the output field.

    A query containing '/' is handled as a proximity query, any other non-blank query as
    a boolean query. A blank query is not forwarded to either processor and is reported
    like an empty result. When no documents match, 'No documents found' is displayed.
    """
    query = entry.get()  # get the query from the text entry field
    if not query.strip():
        # nothing to search for; avoid handing an empty token list to the query processors
        result = ''
    elif '/' in query:  # a '/' marks a proximity query
        result = ProxQueryProcessing(query)
    else:  # otherwise, it is a boolean query
        result = BoolQueryProcessing(query)
    if result == '':  # if the result is empty, display a message
        result = 'No documents found'
    output_label.configure(state='normal')  # the field is kept readonly, so enable it for the update
    output_label.delete(0, tk.END)  # clear the previous result
    output_label.insert(0, result)  # insert the new result
    output_label.configure(state='readonly')  # make the field readonly again
# Global look and feel: dark appearance mode with the 'dark-blue' colour theme.
ctk.set_appearance_mode('Dark')
ctk.set_default_color_theme('dark-blue')

# Main application window (500x400).
root = ctk.CTk()
root.geometry('500x400')
root.title('Boolean Retrieval Model')

# Options shared by both buttons.
_button_options = {
    'font': ("Helvetica", 12),
    'bg_color': 'white',
    'fg_color': "#B6C8A9",
    'hover_color': "white",
    'text_color': "black",
}

# Heading "Boolean Query Model" in Verdana 20, horizontally centred at 20% of the window height.
label1 = ctk.CTkLabel(root, text="Boolean Query Model", font=("Verdana", 20), fg_color="transparent")
label1.place(relx=0.5, rely=0.2, anchor=tk.CENTER)

# Prompt "Enter Query", centred at 30% of the window height.
label2 = ctk.CTkLabel(root, text="Enter Query", fg_color="transparent")
label2.place(relx=0.5, rely=0.3, anchor=tk.CENTER)

# Query input field (width 200), centred at 40% of the window height; read by process_query.
entry = ctk.CTkEntry(root, width=200, bg_color='black')
entry.place(relx=0.5, rely=0.4, anchor=tk.CENTER)

# "Process Query" button: runs process_query when clicked.
process_button = ctk.CTkButton(root, text="Process Query", command=process_query, **_button_options)
process_button.place(relx=0.5, rely=0.5, anchor=tk.CENTER)

# "Exit" button: destroys the main window when clicked.
exit_button = ctk.CTkButton(root, text="Exit", command=root.destroy, **_button_options)
exit_button.place(relx=0.5, rely=0.6, anchor=tk.CENTER)

# Result field (400x50), centred at 80% of the window height. It is an entry widget kept
# readonly; process_query enables it briefly to write the result.
output_label = ctk.CTkEntry(root, width=400, height=50, bg_color='black')
output_label.place(relx=0.5, rely=0.8, anchor=tk.CENTER)
output_label.configure(state='readonly')

if __name__ == "__main__":
    root.mainloop()  # start the event loop only when run as a script