custom_index.py
import itertools
import os
import pickle
from collections import Counter

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

nltk.download("stopwords")
nltk.download("punkt")  # Punkt tokenizer models required by word_tokenize

class CustomIndex:
    """Class for building and searching an inverted index."""

    def __init__(
        self, load_from_file: bool = False, dir_name: str = "custom_index"
    ) -> None:
        """Initialize the index.

        Arguments:
            load_from_file (bool): Whether to load the index from a file.
            dir_name (str): The name of the directory to load the index from and save it to.
        """
        self.cache: list[tuple] = []
        self.index = {}
        self.stop_words = set(stopwords.words("english"))
        self.dir_name = dir_name
        if load_from_file:
            # Load the pickle written by build_index().
            with open(f"index/{self.dir_name}/index.pickle", "rb") as file:
                self.index = pickle.load(file)
        else:
            if not os.path.exists(f"index/{self.dir_name}"):
                os.makedirs(f"index/{self.dir_name}")

    def _preprocess(self, text: str) -> list[str]:
        """Preprocess the text by tokenizing it, removing stop words,
        and removing non-alphanumeric tokens.

        Arguments:
            text (str): The text to preprocess.

        Returns:
            list[str]: The preprocessed tokens.
        """
        tokenized_text = word_tokenize(text.lower())
        preprocessed_text = [
            word
            for word in tokenized_text
            if word not in self.stop_words and word.isalnum()
        ]
        return preprocessed_text

    def add_to_cache(
        self, title: str, first_paragraph: str, text: str, url: str
    ) -> None:
        """Add the text to the cache after preprocessing it and
        counting the occurrences of each word (used for ranking).

        Arguments:
            title (str): The title of the page.
            first_paragraph (str): The first paragraph of the page.
            text (str): The entire text of the page, used to obtain the word frequencies.
            url (str): The URL of the page.
        """
        preprocessed_text = self._preprocess(text)
        counted_words = Counter(preprocessed_text)
        self.cache.append((counted_words, url, first_paragraph, title))

    def build_index(self) -> None:
        """Build the index from the cache and save it to a pickle file."""
        for counted_words, url, first_paragraph, title in self.cache:
            for word, count in counted_words.items():
                if word not in self.index:
                    self.index[word] = []
                self.index[word].append((url, count, first_paragraph, title))
        with open(f"index/{self.dir_name}/index.pickle", "wb") as file:
            pickle.dump(self.index, file)

    def search(self, query: str) -> list[list]:
        """Search the index for the query.

        Arguments:
            query (str): The query to search for.

        Returns:
            result (list[list]): A two-element list. The first element is kept
                empty for compatibility with the WhooshIndex; the second is a
                list of (url, total_count, first_paragraph, title) tuples
                sorted by total count.
        """
        preprocessed_query = set(self._preprocess(query))
        # Get search hits
        search_hits = []
        for word in preprocessed_query:
            if word in self.index:
                search_hits.extend(self.index[word])
        # Sort and group data by URL
        search_hits.sort(key=lambda x: x[0])
        grouped_data = itertools.groupby(search_hits, key=lambda x: x[0])
        # Create result list with URL, sum of counts, and sets of first_paragraph and title
        # For compatibility with the WhooshIndex, the first list is empty
        # instead of containing the corrected query
        result = [[], []]
        for url, group in grouped_data:
            group_list = list(group)
            total_count = sum(item[1] for item in group_list)
            first_paragraph = next(
                iter({item[2] for item in group_list}), "Preview unavailable"
            )
            title = next(iter({item[3] for item in group_list}), "Untitled")
            result[1].append((url, total_count, first_paragraph, title))
        # Sort the result by count
        result[1].sort(key=lambda x: x[1], reverse=True)
        return result
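

if __name__ == "__main__":
    # Illustrative usage sketch (not part of the original module): the titles,
    # paragraphs, and URLs below are made-up placeholder data, used only to
    # show the add_to_cache -> build_index -> search flow.
    index = CustomIndex(dir_name="custom_index")
    index.add_to_cache(
        title="Inverted index",
        first_paragraph="An inverted index maps words to the documents containing them.",
        text="An inverted index maps words to the documents containing them. "
        "It is the core data structure behind most full-text search engines.",
        url="https://example.com/inverted-index",
    )
    index.add_to_cache(
        title="Search engine",
        first_paragraph="A search engine ranks documents by relevance to a query.",
        text="A search engine ranks documents by relevance to a query, "
        "often using word frequencies collected in an inverted index.",
        url="https://example.com/search-engine",
    )
    index.build_index()
    # Returns [[], [(url, total_count, first_paragraph, title), ...]],
    # with hits sorted by total count in descending order.
    print(index.search("inverted index search"))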