pagerank.py

import os
import random
import re
import sys

DAMPING = 0.85   # damping factor d used by both estimators
SAMPLES = 10000  # number of random-walk steps for the sampling estimate


def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print("PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")


def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a set of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )

    return pages
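

# For a toy corpus directory containing 1.html, 2.html and 3.html (file
# names hypothetical), crawl() returns a mapping like:
#
#   {"1.html": {"2.html"}, "2.html": {"1.html", "3.html"}, "3.html": set()}
#
# where 3.html is a "dangling" page with no outgoing links.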


def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.
    With probability `damping_factor`, choose at random one of the links
    on `page`. With probability `1 - damping_factor`, choose a page at
    random from all pages in the corpus.
    """
    num_of_pages = len(corpus)
    transition_m = {}

    if len(corpus[page]) == 0:
        # A page with no outgoing links is treated as linking to every
        # page in the corpus, so the next page is chosen uniformly.
        for page_c in corpus:
            transition_m[page_c] = 1 / num_of_pages
    else:
        # Every page receives the (1 - d) "random jump" share...
        uniform_prob = (1 - damping_factor) / num_of_pages
        for page_c in corpus:
            transition_m[page_c] = uniform_prob
        # ...and each linked page also receives an equal share of d.
        num_linked_pages = len(corpus[page])
        for linked_page in corpus[page]:
            transition_m[linked_page] += damping_factor / num_linked_pages

    # Sanity check: the probabilities must sum to 1. (Rounding each term
    # would break this, so the values are kept at full precision.)
    assert abs(sums_dict(transition_m) - 1) < 1e-9

    return transition_m
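

# Worked example: with the toy corpus above and damping_factor = 0.85,
# transition_model(corpus, "2.html", 0.85) yields
#
#   {"1.html": 0.475, "2.html": 0.05, "3.html": 0.475}
#
# i.e. 0.15 / 3 = 0.05 for every page, plus 0.85 / 2 = 0.425 for each of
# the two pages that 2.html links to.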


def sums_dict(dict_):
    """Return the sum of all values in `dict_`."""
    sum_ = 0
    for key in dict_:
        sum_ += dict_[key]
    return sum_


def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to the transition model, starting with a page at random.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    # Start every count at zero so pages that are never visited still
    # appear in the result.
    sample_pr = {page: 0 for page in corpus}
    previous_page = random.choice(list(corpus))

    for _ in range(n):
        # Get the transition model from the previous page, then draw the
        # next page from it.
        transition_m = transition_model(corpus, previous_page, damping_factor)
        next_page = bernoulli_trial(normalize_tm(transition_m))
        sample_pr[next_page] += 1
        previous_page = next_page

    # Convert visit counts into proportions.
    for page in sample_pr:
        sample_pr[page] /= n

    return sample_pr
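

# With 10,000 samples the estimates typically land within a few hundredths
# of the iterative values computed below, but they change from run to run.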


def normalize_tm(transition_m):
    """
    Receives a transition model (a dictionary) and returns two lists:
    one with the page names, the other with the corresponding cumulative
    probabilities, ready for the weighted draw in `bernoulli_trial`.
    """
    prob = 0
    page_names, norm_probs = [], []
    for page, prob_value in transition_m.items():
        page_names.append(page)
        prob += prob_value
        norm_probs.append(prob)
    assert len(page_names) == len(norm_probs), "Page names and norm probs must be of the same size"
    # The final cumulative probability should be (almost exactly) 1.
    assert norm_probs[-1] > 0.99, f"Last cumulative probability too small: {norm_probs[-1]}"
    return [page_names, norm_probs]
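

# For example, {"a": 0.2, "b": 0.3, "c": 0.5} becomes
#
#   [["a", "b", "c"], [0.2, 0.5, 1.0]]
#
# (up to floating-point rounding): each entry is the running total of the
# probabilities so far.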


def bernoulli_trial(page_names_norm_probs):
    """
    Randomly chooses a page name based on its cumulative probability.
    I.e. the higher the original probability value of a page in its
    transition model, the higher the chance it is selected; the chance of
    being selected equals the probability value from the transition model.
    (Despite the name, this is a categorical draw via the inverse-CDF
    method, not a single Bernoulli trial.)
    """
    # norm_probs example: [0.1, 0.45, 0.67, 1.0] -- always increasing
    page_names = page_names_norm_probs[0]
    norm_probs = page_names_norm_probs[1]
    rand_float = random.uniform(0, 1)
    for index, norm_prob in enumerate(norm_probs):
        if norm_prob > rand_float:
            return page_names[index]
    # Guard against the last cumulative value falling just short of 1.
    return page_names[-1]
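

# An equivalent standard-library one-liner (shown for reference, not used
# above) would draw directly from the transition model:
#
#   random.choices(list(transition_m), weights=transition_m.values(), k=1)[0]
#
# random.choices accepts relative weights, so no cumulative step is needed.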


def generate_parent_nodes(corpus):
    """
    Goes through the corpus and creates a dictionary with pages as keys,
    and the pages that link to those pages (the keys) as values.
    """
    # Start every page with an empty parent set so that pages with no
    # incoming links still appear in the result.
    parent_nodes = {page: set() for page in corpus}
    for page in corpus:
        linked_pages = corpus[page]
        if linked_pages:
            for linked_page in linked_pages:
                parent_nodes[linked_page].add(page)
        else:
            # If the page does not link to any page, we act as though
            # it links to all the pages (including itself).
            for page2 in corpus:
                parent_nodes[page2].add(page)
    return parent_nodes
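

# With the toy corpus above, generate_parent_nodes returns
#
#   {"1.html": {"2.html", "3.html"},
#    "2.html": {"1.html", "3.html"},
#    "3.html": {"2.html", "3.html"}}
#
# since the dangling page 3.html is treated as linking to every page.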


def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    iterate_pr = {}
    parent_nodes = generate_parent_nodes(corpus)
    num_of_pages = len(corpus)
    uniform_prob = (1 - damping_factor) / num_of_pages
    difference = 1

    # Start with a uniform distribution.
    for page in corpus:
        iterate_pr[page] = 1 / num_of_pages

    # Repeatedly apply the update rule
    #
    #     PR(p) = (1 - d) / N + d * sum(PR(i) / NumLinks(i))
    #
    # over all parents i of p, until no page's rank changes by more
    # than 0.001.
    while difference > 0.001:
        # Start below zero, since the deltas are absolute values (>= 0).
        largest_pr_delta = -1
        for page in corpus:
            summation = 0
            for parent in parent_nodes[page]:
                if len(corpus[parent]) > 0:
                    summation += iterate_pr[parent] / len(corpus[parent])
                else:
                    # A dangling parent counts as linking to every page,
                    # so it contributes an equal share to each of them.
                    summation += iterate_pr[parent] / num_of_pages
            new_pr = uniform_prob + damping_factor * summation
            new_pr_delta = abs(iterate_pr[page] - new_pr)
            iterate_pr[page] = new_pr
            # Track the largest change; iteration stops only once every
            # page's rank has moved by less than the threshold.
            if new_pr_delta > largest_pr_delta:
                largest_pr_delta = new_pr_delta
        difference = largest_pr_delta

    return iterate_pr
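

# Quick sanity check: for a symmetric two-page corpus the ranks must be
# equal, e.g.
#
#   iterate_pagerank({"a.html": {"b.html"}, "b.html": {"a.html"}}, 0.85)
#
# returns {"a.html": 0.5, "b.html": 0.5} (up to the 0.001 tolerance).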


if __name__ == "__main__":
    main()