-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsprank.py
110 lines (93 loc) · 3.02 KB
/
sprank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import sqlite3
conn = sqlite3.connect("spider.sqlite")
cur = conn.cursor()
# Find the ids that send out page rank - we only are interested
# in pages in the SCC that have in and out links
cur.execute("""SELECT DISTINCT from_id FROM Links""")
from_ids = list()
for row in cur:
from_ids.append(row[0])
# Find the ids that receive page rank
to_ids = list()
links = list()
cur.execute("""SELECT DISTINCT from_id, to_id FROM Links""")
for row in cur:
from_id = row[0]
to_id = row[1]
if from_id == to_id:
continue
if from_id not in from_ids:
continue
if to_id not in from_ids:
continue
links.append(row)
if to_id not in to_ids:
to_ids.append(to_id)
# Get latest page ranks for strongly connected component
prev_ranks = dict()
for node in from_ids:
cur.execute("""SELECT new_rank FROM Pages WHERE id = ?""", (node,))
row = cur.fetchone()
prev_ranks[node] = row[0]
sval = input("How many iterations:")
many = 1
if len(sval) > 0:
many = int(sval)
# Sanity check
if len(prev_ranks) < 1:
print("Nothing to page rank. Check data.")
quit()
# Lets do Page Rank in memory so it is really fast
for i in range(many):
# print prev_ranks.items()[:5]
next_ranks = dict()
total = 0.0
for node, old_rank in list(prev_ranks.items()):
total = total + old_rank
next_ranks[node] = 0.0
# print total
# Find the number of outbound links and sent the page rank down each
for node, old_rank in list(prev_ranks.items()):
# print node, old_rank
give_ids = list()
for from_id, to_id in links:
if from_id != node:
continue
# print ' ',from_id,to_id
if to_id not in to_ids:
continue
give_ids.append(to_id)
if len(give_ids) < 1:
continue
amount = old_rank / len(give_ids)
# print node, old_rank,amount, give_ids
for id in give_ids:
next_ranks[id] = next_ranks[id] + amount
newtot = 0
for node, next_rank in list(next_ranks.items()):
newtot = newtot + next_rank
evap = (total - newtot) / len(next_ranks)
# print newtot, evap
for node in next_ranks:
next_ranks[node] = next_ranks[node] + evap
newtot = 0
for node, next_rank in list(next_ranks.items()):
newtot = newtot + next_rank
# Compute the per-page average change from old rank to new rank
# As indication of convergence of the algorithm
totdiff = 0
for node, old_rank in list(prev_ranks.items()):
new_rank = next_ranks[node]
diff = abs(old_rank - new_rank)
totdiff = totdiff + diff
avediff = totdiff / len(prev_ranks)
print(i + 1, avediff)
# rotate
prev_ranks = next_ranks
# Put the final ranks back into the database
print(list(next_ranks.items())[:5])
cur.execute("""UPDATE Pages SET old_rank=new_rank""")
for id, new_rank in list(next_ranks.items()):
cur.execute("""UPDATE Pages SET new_rank=? WHERE id=?""", (new_rank, id))
conn.commit()
cur.close()