import math
import re
import sys
import xml.etree.ElementTree as et
from collections import defaultdict

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import file_io

STOP_WORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()

class Index:
    def __init__(self, xml_file: str, title_file: str, docs_file: str, words_file: str):
        """Constructor: initializes the index's dictionaries and runs the full indexing pipeline
        Parameters:
        :param xml_file: the name of the input file that the indexer will read in and parse
        :param title_file: maps document IDs to document titles
        :param docs_file: stores the rankings computed by PageRank
        :param words_file: stores the relevance of documents to words
        Returns: none
        Throws: none
        """
self.xml_file = xml_file
self.title_file = title_file
self.docs_file = docs_file
self.words_file = words_file
        self.tokenization_regex = r"""\[\[[^\[]+?\]\]|[a-zA-Z0-9]+'[a-zA-Z0-9]+|[a-zA-Z0-9]+"""
        self.link_regex = r"""\[\[[^\[]+?\]\]"""
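        # The tokenization regex matches, in order of priority: wiki links such as
        # "[[Category:Computer Science]]", words with an internal apostrophe such as
        # "don't", and plain alphanumeric words.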
        wiki_tree = et.parse(self.xml_file)  # load the XML file into a tree
        root = wiki_tree.getroot()  # get the root of the tree
        self.all_pages = root.findall('page')  # get all the pages
        self.file_io = file_io
        self.word_to_id_to_count = {}  # maps each word to an inner dict of doc ids to the number of times the word appears in that doc
        self.all_page_ids = set()  # set of all page ids
        self.weights_dict = {}  # nested dict mapping a doc id j to an inner dict of doc ids k to the PageRank weight from j to k
        self.id_to_links = defaultdict(set)  # maps each doc id to the set of doc ids it links to
        self.title_to_page_id = {}  # maps each page title to its id
        self.ids_to_titles = {}  # maps each page id to its title
        self.id_to_highest_freq = {}  # maps each doc id to the number of occurrences of the most frequent term in that doc
        self.term_to_num_of_docs = {}  # maps each term to the number of documents it appears in
        self.idf_dict = {}  # maps each word to its inverse document frequency (idf)
        self.words_to_ids_to_relevance = {}  # maps each word to an inner dict of the ids of the docs it appears in to the relevance score
        self.tf_dict = {}  # maps each word to an inner dict of doc ids to the word's term frequency in that doc
        self.ids_to_pageRank_dict = {}  # maps each doc id to its PageRank score
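        # Build the index: parse the wiki, run PageRank over the link structure,
        # compute tf-idf relevance for every word, then write the output files.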
self.populate_title_page_id()
self.parse()
self.populate_weights_dict()
self.compute_page_rank()
self.compute_most_freq_count_and_n_i()
self.compute_idf()
self.compute_term_frequency()
self.compute_term_relevance()
self.write_docs_file()
self.write_words_file()
self.write_title_file()
    def parse(self):
        """Parses the input XML file, populates id_to_links, and calls
        populate_word_to_ids_to_counts to populate the word_to_id_to_count dict
        Parameters: none
        Returns: none
        """
        if len(self.all_pages) == 0:
            print("Empty wiki")
        for page in self.all_pages:  # loop through all the pages
            page_title: str = (page.find('title').text).lower()  # the title of the page
            if page.find('text').text is not None:
                page_text: str = (page.find('text').text).lower()  # the text of the page (as a str)
            else:
                page_text: str = ''
            id: int = int(page.find('id').text)
            self.all_page_ids.add(id)  # keep track of all the page ids
            page_tokens = re.findall(self.tokenization_regex, page_title + ' ' + page_text)
            links = set(re.findall(self.link_regex, page_text))  # a set makes the membership test below fast
            for term in page_tokens:  # loop through the tokens
                if term in links:  # the token is a link
                    sliced_address = self.handle_Links(term, True)
                    # record the link unless it points to the page itself or to a page outside the wiki
                    if sliced_address != self.ids_to_titles[id] and sliced_address in self.title_to_page_id:
                        self.id_to_links[id].add(self.title_to_page_id[sliced_address])
                    sliced_text_links = self.handle_Links(term, False)
                    sliced_links_token = re.findall(self.tokenization_regex, sliced_text_links)  # tokenize the link text
                    for word in sliced_links_token:
                        processed_word = self.remove_stop_words_and_stem(word)
                        self.populate_word_to_ids_to_counts(processed_word, id)
                else:  # the token is an ordinary word
                    processed_word = self.remove_stop_words_and_stem(term)
                    self.populate_word_to_ids_to_counts(processed_word, id)
    # populating title_to_page_id and ids_to_titles at the same time is more efficient, since we only loop through the pages once for both dicts
def populate_title_page_id(self):
"""Populates title_to_page_id and ids_to_titles dictionaries
Parameters: none
Returns: none
Throws: none
"""
        for page in self.all_pages:  # loop through all pages
            title: str = ((page.find('title').text).strip()).lower()  # get the title
            id: int = int(page.find('id').text)
            self.title_to_page_id[title] = id  # map title to id
            self.ids_to_titles[id] = title  # map id to title
    def write_title_file(self):
        """Writes the ids_to_titles mapping to the title file
        Parameters: none
        Returns: none
        Throws: none
        """
        self.file_io.write_title_file(self.title_file, self.ids_to_titles)
    def handle_Links(self, term: str, page_link: bool):
        """Processes tokens that are links
        Parameters:
        term -- token that is a link, including its surrounding brackets
        page_link -- True to return the link address, False to return the link text
        Return:
        no_bar or sliced -- a string of the processed link
        Throws: none
        """
if "|" in term: #if text of link is different than the page it links too
sliced = term[2:-2] #obtaining the word from term without the link brackets by extracting all letters except the first and last two characters
no_bar = sliced.split("|") #splitting the word along the bar to obtain a list of the word(s) before and after the bar
if page_link == True: #if the desired returned word is the link address
return no_bar[0] #return the left word(s) of the bar
else:
return no_bar[1] #return the right word(s) of the bar
else: #if text of link is the same as the page it links too
sliced = term[2:-2]
return sliced
    def remove_stop_words_and_stem(self, term: str):
        """Stems the argument if it is not a stop word
        Parameters:
        term -- string to be checked against the stop words and stemmed
        Returns: the stemmed string, or None if term is a stop word
        Throws: none
        """
        if term not in STOP_WORDS:  # stop words fall through and yield None
            return STEMMER.stem(term)
    def populate_word_to_ids_to_counts(self, word_stem: str, id: int):
        """Fills the nested dictionary that maps words to an inner dictionary of ids mapped to word counts
        Parameters:
        word_stem -- the stemmed word (the outer key), or None if the word was a stop word
        id -- integer id of the doc that word_stem appears in
        Returns: none
        Throws: none
        """
        if word_stem is not None:  # stop words come through as None and are skipped
            if word_stem not in self.word_to_id_to_count:  # first time seeing this word
                self.word_to_id_to_count[word_stem] = {}  # create the inner dictionary the word maps to
            if id not in self.word_to_id_to_count[word_stem]:  # first time seeing this word in this doc
                self.word_to_id_to_count[word_stem][id] = 0
            self.word_to_id_to_count[word_stem][id] += 1  # increase the count by 1
    def compute_most_freq_count_and_n_i(self):
        """Populates the term_to_num_of_docs and id_to_highest_freq dictionaries
        Parameters: none
        Returns: none
        Throws: none
        """
        for word in self.word_to_id_to_count:  # loop through each word
            for id in self.word_to_id_to_count[word]:  # loop through each doc the word appears in
                if word not in self.term_to_num_of_docs:
                    self.term_to_num_of_docs[word] = 0
                self.term_to_num_of_docs[word] += 1  # count each doc that the term appears in
                if id not in self.id_to_highest_freq:
                    self.id_to_highest_freq[id] = 0
                # keep the highest term count seen so far for this doc
                self.id_to_highest_freq[id] = max(self.id_to_highest_freq[id],
                                                  self.word_to_id_to_count[word][id])
    def compute_idf(self):
        """Computes the idf of each word and populates idf_dict
        Parameters: none
        Returns: none
        """
        n = len(self.all_pages)  # the total number of pages
        for word in self.word_to_id_to_count:  # loop through each word
            n_i = self.term_to_num_of_docs[word]  # the number of documents the word appears in
            self.idf_dict[word] = math.log(n / n_i)  # map the word to its idf
    def compute_term_frequency(self):
        """Calculates term frequencies and populates tf_dict
        Parameters: none
        Returns: none
        """
        for word in self.word_to_id_to_count:  # loop through each word that appears in the documents
            if word not in self.tf_dict:
                self.tf_dict[word] = {}
            for id in self.word_to_id_to_count[word]:  # loop through each doc the word appears in
                term_count = self.word_to_id_to_count[word][id]
                self.tf_dict[word][id] = term_count / self.id_to_highest_freq[id]
    def compute_term_relevance(self):
        """Computes term relevance and populates the words_to_ids_to_relevance dict
        Parameters: none
        Returns: none
        """
        for word in self.word_to_id_to_count:  # loop through each word that appears in the documents
            if word not in self.words_to_ids_to_relevance:
                self.words_to_ids_to_relevance[word] = {}
            for id in self.word_to_id_to_count[word]:  # loop through each doc the word appears in
                # map the doc id to the word's relevance score for that doc
                self.words_to_ids_to_relevance[word][id] = self.idf_dict[word] * self.tf_dict[word][id]
    def write_words_file(self):
        """Writes the words_to_ids_to_relevance mapping to the words file
        Parameters: none
        Returns: none"""
        self.file_io.write_words_file(self.words_file, self.words_to_ids_to_relevance)
    def populate_weights_dict(self):
        """Calculates the PageRank weights and populates weights_dict
        Parameters: none
        Returns: none
        """
        n = len(self.all_page_ids)  # the number of pages in the wiki
        e = 0.15  # the damping factor used in the weight formula
        for j in self.all_page_ids:  # loop through all ids
            if j not in self.weights_dict:  # map the current id to an inner dictionary
                self.weights_dict[j] = {}
            for k in self.all_page_ids:  # loop through all ids again, handling each case below
                if j == k:  # a doc linked to itself: the self-link is ignored
                    self.weights_dict[j][k] = e / n
                elif k in self.id_to_links[j]:  # j is linked to k
                    nk = len(self.id_to_links[j])
                    self.weights_dict[j][k] = e / n + (1 - e) / nk
                elif not self.id_to_links[j]:  # j is not linked to anything
                    nk = n - 1
                    self.weights_dict[j][k] = e / n + (1 - e) / nk
                else:  # every other case
                    self.weights_dict[j][k] = e / n
    def euclidean_distance(self, page_rank: dict, r: dict):
        """Helper method calculating the Euclidean distance used to test PageRank convergence
        Parameters:
        page_rank -- dictionary of ids mapped to the current PageRank values
        r -- dictionary of ids mapped to the previous PageRank values
        Returns: the Euclidean distance between the two rank vectors"""
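        # distance(r', r) = sqrt(sum over all ids i of (r'_i - r_i)^2)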
        # align the two vectors by doc id so that dictionary ordering does not matter
        page_rank_array = np.array([page_rank[id] for id in page_rank])
        r_array = np.array([r[id] for id in page_rank])
        dist = np.sqrt(np.sum(np.square(r_array - page_rank_array)))  # computing the euclidean distance
        return dist
    def compute_page_rank(self):
        """Computes PageRank values and populates ids_to_pageRank_dict
        Parameters: none
        Returns: none
        """
        r_dict = {}
        n = len(self.all_page_ids)
        delta = 0.001  # the convergence threshold
        for id in self.all_page_ids:
            r_dict[id] = 0  # initialize every rank in r to 0
            self.ids_to_pageRank_dict[id] = 1 / n  # initialize every rank in r' to 1/n
        # keep refining ids_to_pageRank_dict until the rank computation converges sufficiently
        while self.euclidean_distance(self.ids_to_pageRank_dict, r_dict) > delta:
            r_dict = self.ids_to_pageRank_dict.copy()  # r takes the current values of r'
            for j in self.all_page_ids:  # recompute the rank of each page
                self.ids_to_pageRank_dict[j] = 0
                for k in self.all_page_ids:
                    self.ids_to_pageRank_dict[j] += self.weights_dict[k][j] * r_dict[k]
    def write_docs_file(self):
        """Writes the ids_to_pageRank_dict mapping to the docs file
        Parameters: none
        Returns: none
        """
        self.file_io.write_docs_file(self.docs_file, self.ids_to_pageRank_dict)
if __name__ == "__main__":
    if len(sys.argv) - 1 == 4:  # check that exactly four arguments were supplied
        Index(*sys.argv[1:])
    else:
        print('Usage: <XML filepath> <titles filepath> <docs filepath> <words filepath>')
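# Example invocation (hypothetical file names):
#   python3 index.py wiki.xml titles.txt docs.txt words.txt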