-
Notifications
You must be signed in to change notification settings - Fork 0
/
pagerank.py
182 lines (147 loc) · 6.08 KB
/
pagerank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import os
import random
import re
import sys
DAMPING = 0.85
SAMPLES = 10000
def main():
    """Command-line entry point.

    Expects one argument (a corpus directory), crawls it, then prints
    PageRank estimates computed two ways: by random-walk sampling and by
    iterative convergence.
    """
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])

    def report(title, ranks):
        # Print one results section with pages in sorted order.
        print(title)
        for page in sorted(ranks):
            print(f" {page}: {ranks[page]:.4f}")

    report(
        f"PageRank Results from Sampling (n = {SAMPLES})",
        sample_pagerank(corpus, DAMPING, SAMPLES),
    )
    report(
        f"PageRank Results from Iteration",
        iterate_pagerank(corpus, DAMPING),
    )
def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.

    Return a dictionary where each key is a page (filename), and each value
    is the set of OTHER corpus pages that page links to. Self-links are
    discarded, as are links pointing outside the corpus. Non-.html files
    are ignored entirely.
    """
    pages = dict()

    # Extract all href targets from each HTML file in the directory.
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            # Subtract {filename} so a page never counts a link to itself.
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus.
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename] if link in pages
        )

    return pages
def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.

    With probability `damping_factor`, choose uniformly among the links on
    `page`. With probability `1 - damping_factor`, choose uniformly among
    ALL pages in the corpus. If `page` has no outgoing links, treat it as
    linking to every page (uniform distribution over the whole corpus).

    Fixes over the previous version: the loop variables no longer shadow
    the `page` parameter, and the redundant zero-initialization pass is
    removed. Numeric results are unchanged.
    """
    num_pages = len(corpus)
    links = corpus[page]

    # Dangling page: pretend it links to every page equally.
    if not links:
        return {p: 1 / num_pages for p in corpus}

    # Every page gets the random-jump share; linked pages also get an
    # equal slice of the damping share.
    distribution = {p: (1 - damping_factor) * (1 / num_pages) for p in corpus}
    link_share = damping_factor * (1 / len(links))
    for link in links:
        distribution[link] += link_share

    return distribution
def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to the transition model, starting with a page at random.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    # Each visit contributes 1/n to the visited page's estimate.
    weight = 1 / n
    ranks = dict.fromkeys(corpus, 0)

    # First sample: a uniformly random starting page.
    current = random.choice(list(corpus.keys()))
    ranks[current] += weight

    # Remaining n - 1 samples follow the Markov chain defined by the
    # transition model of whatever page we are currently on.
    for _ in range(1, n):
        distribution = transition_model(corpus, current, damping_factor)
        current = random.choices(
            list(distribution.keys()),
            list(distribution.values()),
            k=1
        )[0]
        ranks[current] += weight

    return ranks
def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.

    Fixes over the previous version: the convergence threshold is the
    literal 1e-7 (numerically equal to the old `0.001 * (1 / SAMPLES)`)
    instead of being coupled to the unrelated sampling constant, and the
    function now returns the CONVERGED values rather than the previous
    iteration's values.
    """
    num_pages = len(corpus)
    # 1e-7 == 0.001 / 10000, the value the old SAMPLES-coupled expression
    # produced; stated directly because this deterministic iteration has
    # nothing to do with the sampling count.
    convergence_threshold = 1e-7

    # Start every page with an equal share of rank.
    ranks = {page: 1 / num_pages for page in corpus}

    while True:
        new_ranks = {}
        for page in corpus:
            total = 0
            for other, links in corpus.items():
                if page in links:
                    # `other` links to `page`: share its rank among its links.
                    total += ranks[other] / len(links)
                elif not links:
                    # Dangling page: interpret it as linking to every page.
                    total += ranks[other] / num_pages
            new_ranks[page] = (1 - damping_factor) / num_pages + damping_factor * total

        # Converged when no page's rank moved more than the threshold.
        largest_change = max(abs(new_ranks[p] - ranks[p]) for p in ranks)
        if largest_change < convergence_threshold:
            return new_ranks
        ranks = new_ranks
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
# Usage example:
# (base) razvansavin@AEGIS:~/ProiecteVechi/CS50AI/pagerank$ python pagerank.py corpus0
# PageRank Results from Sampling (n = 10000)
# 1.html: 0.2208
# 2.html: 0.4293
# 3.html: 0.2216
# 4.html: 0.1283
# PageRank Results from Iteration
# 1.html: 0.2199
# 2.html: 0.4292
# 3.html: 0.2199
# 4.html: 0.1310