search_large_corpus.py
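'''
Search a document collection with BM25.

On the first run the files under the "documents" folder are read, tokenised,
stop-word-filtered and stemmed, and the resulting term frequencies are saved
to index.txt; later runs load index.txt directly. Queries are read either from
files/queries.txt (-m automatic, results written to output.txt) or from the
terminal (-m interactive, results printed until QUIT is entered).
'''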
import getopt
import json
import math
import os
import re
import sys
from files import porter
containNum = {}  # Document frequency: term -> number of documents in the collection that contain it
path = []  # Collected file paths of the document collection
def check_if_dir(file_path):
    '''
    Recursively collect the paths of all files in a nested folder
    '''
    temp_list = os.listdir(file_path)
    # List every entry in file_path
    for temp_list_each in temp_list:
        if os.path.isfile(file_path + '/' + temp_list_each):
            # The entry is a regular file
            temp_path = file_path + '/' + temp_list_each
            # Build the full file path
            if temp_list_each == ".DS_Store":
                # Skip macOS .DS_Store files, which are not documents
                continue
            else:
                path.append(temp_path)
                # Add the file path to the list
        else:
            check_if_dir(file_path + '/' + temp_list_each)  # Recurse into subdirectories
return path
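# Illustrative result, assuming a hypothetical folder layout with
# documents/doc1.txt and documents/sub/doc2.txt:
#   check_if_dir("documents") -> ["documents/doc1.txt", "documents/sub/doc2.txt"]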
def read(input_str):
path = "documents" # Destination folder directory
files = check_if_dir(path) # Get the names of all the files in the folder
s = {}
collections = {}
    if not os.path.isfile("index.txt"):
        '''
        When the program is run for the first time,
        an index is built from the document collection
        so that the BM25 scores can be computed
        '''
        for file in files:  # Traverse the file paths
            if not os.path.isdir(file):
                # Open only regular files, not folders
                with open(file, encoding='UTF-8') as f:
                    text = f.read()
                    # Read the whole file into one string
                s[file] = text
                # The text of each file is stored in the dictionary
'''
Dictionary keys: File path
Dictionary values: File text
'''
        for file_path in s.keys():
            did = file_path.split("/")
            # Split the path so the document name can be extracted
            b = re.split(r'\W+', s[file_path])
            # Split the text on non-word characters, removing punctuation
            c = [word.lower() for word in b if len(word) > 0]
            # Drop empty tokens and lowercase the remaining ones
            collections[did[len(did) - 1]] = remove_stopword(c)
            # remove_stopword performs the remaining processing on the tokenised text:
            # 1) remove stop words, 2) stem each term, 3) count how often each term occurs
            # The processed text is stored in the dictionary, keyed by document name
'''
Set up dictionary to store document frequencies:
key: Document id
value is a dictionary:
key: Term
value: Frequency within that document
'''
with open('index.txt', 'w', encoding='UTF-8') as f:
f.write(json.dumps(collections, ensure_ascii=False))
        '''
        Store the dictionary in a txt file so that later runs
        can load the processed index directly instead of
        re-reading and re-processing the whole collection
        '''
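        # Illustrative shape of index.txt (document names and terms are made up):
        #   {"doc1.txt": {"search": 2, "index": 1}, "doc2.txt": {"search": 3}}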
    else:
        '''
        When the program is run a second time,
        if the index file exists,
        it will use the stored dictionary directly
        '''
        with open('index.txt', encoding='UTF-8') as f:
            for line in f:
                collections = json.loads(line)
        for frequency in collections.values():
            for term in frequency.keys():
                if term not in containNum:
                    # First document seen that contains the term
                    containNum[term] = 1
                else:
                    # Another document contains the term,
                    # so add 1 to its document frequency
                    containNum[term] = containNum[term] + 1
'''
Set up dictionary to store the total number of documents in the collection that contain term:
key: Term
value: Number
'''
if input_str == "automatic":
# Automatic mode
calculate_BM25(read_queries_auto(), collections)
# Calculate BM25
elif input_str == "interactive":
        # Interactive mode
while True:
queries = read_queries_interactive()
if queries == "QUIT":
break
calculate_BM25_interactive(queries, collections)
# Calculate BM25
def remove_stopword(txt):
    stemmer = porter.PorterStemmer()
    frequency = {}
    stopwords = []
    with open('files/stopwords.txt', encoding='UTF-8') as f:
        for line in f:
            stopwords.append(line.rstrip())
            # Read the stop word list
    for term in txt:
        if term not in stopwords:
            term = stemmer.stem(term)
            # Stemming
            if term not in frequency:
                frequency[term] = 1
                # First occurrence of the term in the current document,
                # so its frequency starts at 1
                if term not in containNum:
                    # First document seen that contains the term
                    containNum[term] = 1
                else:
                    # Another document contains the term,
                    # so add 1 to its document frequency
                    containNum[term] = containNum[term] + 1
            else:
                frequency[term] = frequency[term] + 1
                # The term has already been seen in this document,
                # so add 1 to its frequency
return frequency
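# Illustrative call, assuming "the" is listed in files/stopwords.txt and using
# typical Porter-stemmer output ("dogs" -> "dog", "running" -> "run"):
#   remove_stopword(["the", "dogs", "dog", "running"]) -> {"dog": 2, "run": 1}
# As a side effect, containNum["dog"] and containNum["run"] each increase by 1,
# since this document is one more document containing those terms.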
def queries_manage(txt):
# This method involves the removal of stop words and stemming of queries
    stemmer = porter.PorterStemmer()
    words = []
    stopwords = []
    with open('files/stopwords.txt', encoding='UTF-8') as f:
        for line in f:
            stopwords.append(line.rstrip())
for term in txt:
if term not in stopwords:
term = stemmer.stem(term)
words.append(term)
return words
def calculate_BM25(queries, dict):
dlength = len(dict)
# Number of documents
total_len = 0
result = {}
queries_bm25 = {}
k = 1
b = 0.75
for n in dict.keys():
total_len = total_len + len(dict[n])
# Calculate the total length of all documents
avg_len = total_len / len(dict)
# Calculate the average length of each document in the collection
for i in queries.keys():
# Iterate over the id of queries
doc_bm25 = {}
for j in dict.keys():
# Iterate over the ids of all documents
bm25 = 0
for term in queries[i]:
# Iterate over the terms of each query
if term in dict[j]:
# If the term is in the current document
bm25 = bm25 + (((dict[j][term] * (1 + k)) / (
dict[j][term] + k * ((1 - b) + (b * len(dict[j]) / avg_len)))) * math.log2(
((dlength - containNum[term] + 0.5) / (containNum[term] + 0.5))))
                    '''
                    bm25 += (fij * (1 + k)) / (fij + k * ((1 - b) + b * len(dj) / avg_doclen)) * log2((N - ni + 0.5) / (ni + 0.5))
                    fij (dict[j][term]): frequency of the term in document dj
                    k, b: constants
                    len(dj) (len(dict[j])): length of dj (the number of distinct terms in the document)
                    avg_doclen (avg_len): the average length of a document in the collection
                    N (dlength): total number of documents in the collection
                    ni (containNum[term]): total number of documents in the collection that contain the term
                    '''
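                    # Worked example with illustrative numbers (not from the collection):
                    # fij = 2, k = 1, b = 0.75, len(dj) = avg_doclen, N = 100, ni = 25
                    # term-frequency part: 2 * (1 + 1) / (2 + 1 * 1) = 4/3 ≈ 1.33
                    # idf part: log2((100 - 25 + 0.5) / (25 + 0.5)) = log2(2.96) ≈ 1.57
                    # contribution added to bm25: 1.33 * 1.57 ≈ 2.09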
doc_bm25[j] = bm25
'''
Set up dictionary to store bm25:
key: Document id
value: Bm25
'''
result[i] = doc_bm25
'''
Set up dictionary to store bm25 of each query:
key: Query id
value is a dictionary:
key: Document id
value: Bm25
'''
    for i, j in zip(result.values(), queries):
        query = sorted(i.items(), key=lambda x: x[1], reverse=True)
        # Sort the documents for this query by BM25 score, from largest to smallest
        queries_bm25[j] = query
        # Store the sorted ranking under the query id
    with open('output.txt', 'w') as f:
        for qid, ranking in queries_bm25.items():
            max_value = ranking[0][1]
            # Scores are normalised by the top score; when the top score is 0
            # the similarity is written as 0 so there is no division by zero
            for rank, (doc_id, score) in enumerate(ranking[:15], start=1):
                # The output file for the automatic runs has 4 columns:
                # Query ID, Document ID, Rank, Similarity Score
                if max_value != 0:
                    f.write(str(qid) + " " + str(doc_id) + " " + str(rank) + " " + str(round(score / max_value, 4)) + "\n")
                else:
                    f.write(str(qid) + " " + str(doc_id) + " " + str(rank) + " " + str(0) + "\n")
def calculate_BM25_interactive(queries, dict):
dlength = len(dict)
# Number of documents
total_len = 0
result = {}
queries_bm25 = {}
k = 1
b = 0.75
for n in dict.keys():
total_len = total_len + len(dict[n])
# Calculate the total length of all documents
avg_len = total_len / len(dict)
# Calculate the average length of each document in the collection
for i in queries.keys():
# Iterate over the id of queries
doc_bm25 = {}
for j in dict.keys():
# Iterate over the ids of all documents
bm25 = 0
for term in queries[i]:
# Iterate over the terms of each query
if term in dict[j]:
# If the term is in the current document
bm25 = bm25 + (((dict[j][term] * (1 + k)) / (
dict[j][term] + k * ((1 - b) + (b * len(dict[j]) / avg_len)))) * math.log2(
((dlength - containNum[term] + 0.5) / (containNum[term] + 0.5))))
                    '''
                    bm25 += (fij * (1 + k)) / (fij + k * ((1 - b) + b * len(dj) / avg_doclen)) * log2((N - ni + 0.5) / (ni + 0.5))
                    fij (dict[j][term]): frequency of the term in document dj
                    k, b: constants
                    len(dj) (len(dict[j])): length of dj (the number of distinct terms in the document)
                    avg_doclen (avg_len): the average length of a document in the collection
                    N (dlength): total number of documents in the collection
                    ni (containNum[term]): total number of documents in the collection that contain the term
                    '''
doc_bm25[j] = bm25
'''
Set up dictionary to store bm25:
key: Document id
value: Bm25
'''
result[i] = doc_bm25
'''
Set up dictionary to store bm25 of each query:
key: Query id
value is a dictionary:
key: Document id
value: Bm25
'''
    for i, j in zip(result.values(), queries):
        query = sorted(i.items(), key=lambda x: x[1], reverse=True)
        # Sort the documents for this query by BM25 score, from largest to smallest
        queries_bm25[j] = query
        # Store the sorted ranking under the query id
print("Results for query: ")
for i in queries_bm25.values():
max_value = i[0][1]
# Since the numerator cannot be 0 when normalising,
# it is divided into the case where bm25 is 0 and the case where it is not 0
for j in range(0, 15):
if max_value != 0:
# The output for the interactive runs should have 4 columns:
# Query ID, Document ID, Rank, Similarity Score.
print(
str(list(queries_bm25.keys())[list(queries_bm25.values()).index(i)]) + " " + str(
i[j][0]) + " " + str(
j + 1) + " " + str(round(i[j][1] / max_value, 4)) + " " + "\n")
else:
print(
str(list(queries_bm25.keys())[list(queries_bm25.values()).index(i)]) + " " + str(
i[j][0]) + " " + str(
j + 1) + " " + str(0) + " " + "\n")
def read_queries_auto():
    # Read and process the query file, one query per line
    queries = {}
    with open('files/queries.txt', encoding='gbk') as f:
        for line in f:
            b = re.split(r'\W+', line)
            c = [word.lower() for word in b if len(word) > 0]
            # The first token on the line is the query ID; the rest is the query text
            queries[c[0]] = queries_manage(c[1:])
    return queries
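# Assumed layout of files/queries.txt (illustrative content), matching the parsing above:
#   1 information retrieval evaluation
#   2 stemming and stop word removal
# The first token on each line is the query ID; the remaining tokens form the query.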
def read_queries_interactive():
    # Read and process a query entered by the user
    queries = {}
    user_input = input("Enter query: ")
    if user_input == "QUIT":
        return user_input
    else:
        b = re.split(r'\W+', user_input)
        c = [word.lower() for word in b if len(word) > 0]
        # Interactive input has no query ID, so the whole input is treated as the query
        queries[len(queries) + 1] = queries_manage(c)
        return queries
def main(argv):
    # Read and parse the command entered by the user in the terminal
    input_str = ""
    try:
        opts, args = getopt.getopt(argv[1:], "m:")
        # Get the value passed with the -m option
    except getopt.GetoptError as e:
        # Report the command-line error and exit
        print(e.msg)
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-m":
            input_str = arg
    if input_str == "interactive":
        # Run in interactive mode
        read(input_str)
    elif input_str == "automatic":
        # Run in automatic mode
        read(input_str)
if __name__ == "__main__":
main(sys.argv)
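# Example invocations, assuming the documents/ and files/ folders are in place:
#   python search_large_corpus.py -m automatic     (writes the rankings to output.txt)
#   python search_large_corpus.py -m interactive   (prompts for queries until QUIT is entered)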