-
Notifications
You must be signed in to change notification settings - Fork 4
/
generate_index.py
37 lines (33 loc) · 1.49 KB
/
generate_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import sys
from os import listdir
import reuters
from vocabulary import preprocess
from spimi import spimi_invert, merge_blocks
def compile_inverted_index(block_size_limit):
    """Generate the inverted index for Reuters21578.

    Runs the whole pipeline in order: retrieve the corpus documents,
    preprocess them, run SPIMI inversion, then merge every block file found
    in ``index_blocks/`` into the final index.

    Args:
        block_size_limit: Block size limit handed unchanged to
            ``spimi_invert`` (the entry script describes it as bytes).
    """
    # Imported locally so this function does not depend on a change to the
    # module-level import list.
    from contextlib import ExitStack

    reuters_corpus = reuters.ReutersCorpus()

    # Retrieve Reuters documents
    print("=============== Retriving documents... =============== ")
    documents = reuters_corpus.retrieveDocuments()

    # Preprocessing documents
    print("=============== Preprocessing documents... ===============")
    documents = preprocess(documents)

    # Perform Single-pass in-memory (SPIMI) indexing
    print("=============== Applying SPIMI... ===============")
    spimi_invert(documents, block_size_limit)

    # Merge blocks into final index
    print("=============== Merging SPIMI blocks into final inverted index... ===============")
    # Every block file is registered with the ExitStack so that all handles
    # are closed once merging finishes, and also if opening a later block or
    # the merge itself raises. Previously the handles were never closed.
    with ExitStack() as stack:
        spimi_blocks = [
            stack.enter_context(open('index_blocks/' + block))
            for block in listdir('index_blocks/')
        ]
        merge_blocks(spimi_blocks)
if __name__ == '__main__':
    # The block size limit (in bytes) comes from the first command-line
    # argument when one is supplied; otherwise the 0.75 MB default is used
    # (250000 was the earlier default).
    cli_args = sys.argv[1:]
    block_size_limit = int(cli_args[0]) if cli_args else 750000
    print("Current block size limit: ", block_size_limit)
    compile_inverted_index(block_size_limit)