-
Notifications
You must be signed in to change notification settings - Fork 0
/
page_rank.py
93 lines (81 loc) · 4.96 KB
/
page_rank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import argparse
import os.path
import json
def pageRank(nodeDict, maxiterations, arglambda, threshold, iteration=0, currentPageRank={}):
if(iteration == 0):
# initialize page rank for each node. We assume that each PR is equal initially
for node in nodeDict:
currentPageRank[node] = 1/len(nodeDict)
if(iteration >= maxiterations):
return currentPageRank, iteration
# Begining of Page Rank calculation
newPageRank = {}
for node in nodeDict:
# get the list of nodes that point to this node
inNodes = nodeDict[node][0]
# calculate the page rank for this node
pRank = 0
for inNode in inNodes: # for each node that links to this node
numberOfOutNodes = len(nodeDict[inNode][1]) # get the number of outgoing links
pRank += currentPageRank[inNode]/numberOfOutNodes # add the page rank of the node divided by the number of it's outgoing links
pRank = (arglambda/len(nodeDict)) + ((1-arglambda)*pRank) # calculate the page rank for this node
newPageRank[node] = pRank # add the new page rank to the dictionary
# End of Page Rank calculation
# check if the change in page rank is less than the threshold
change = 0
for node in nodeDict:
change += abs(currentPageRank[node] - newPageRank[node])
if(change < threshold):
# return the new page rank and the number of iterations (iteration + 1 because we are returning the number of iterations that have been completed)
return newPageRank, iteration + 1
else:
# Otherwise call the function again with the new page rank
return pageRank(nodeDict, maxiterations, arglambda, threshold, iteration+1, newPageRank)
def readNodesfromFile(filename):
# Returns a dictionary of tuples with the key being the node number => N : (Nodes that point to N, Nodes that N points to)
nodeDict = {}
if os.path.isfile(filename):
with open(filename, 'r') as f:
for line in f:
if(line[0] == "#"):
continue # skip comments
source = int(line.split("\t")[0]) # first column is the source ID
dest = int(line.split("\t")[1]) # second column is the destination ID
if(source not in nodeDict):
# create a new entry in the dictionary
nodeDict[source] = ([], [])
if(dest not in nodeDict):
# create a new entry in the dictionary
nodeDict[dest] = ([], [])
nodeDict[source][1].append(dest) # add the destination to the list of nodes that the source points to
nodeDict[dest][0].append(source) # add the source to the list of nodes that are pointing to the destination
else:
print("File does not exist")
return nodeDict
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Page Rank Program')
parser.add_argument("-i", "--maxiteration", help="Maximum number of iterations the program will complete.", default=3, type=int)
parser.add_argument("-l", "--arglambda", help="The lambda / random surfer parameter value.", default=0.85, type=float)
parser.add_argument("-t", "--threshold", help="The threshold value. The program will stop once the total difference in all nodes page rank change is less than this value.", default=0.0001, type=float)
parser.add_argument("-n", "--nodes", help="The Node IDs that we want to get page rank from. Usage: -n/--nodes node1 node2 node3 ...", default=[], nargs='+', type=int) # nargs='+' means 1 or more
parser.add_argument("-d", "--dump", help="Dumps page rank to JSON file when True.",action="store_true", default=False)
args = parser.parse_args()
# Conditional print statements
if(len(args.nodes) == 0):
print("Finding Page Rank with max iterations: ", args.maxiteration, " lambda: ", args.arglambda, " threshold: ", args.threshold)
else:
print("Finding Page Rank for nodes: ", args.nodes, " with max iterations: ", args.maxiteration, " lambda: ", args.arglambda, " threshold: ", args.threshold)
# Read nodes from file and store them in a dictionary with the key being the node number => N : (Nodes that point to N, Nodes that N points to)
nodeDict = readNodesfromFile("web-Stanford.txt")
# Calculate page rank and store it in a dictionary with the key being the node number => N : Page Rank
pageRankDict, iteration = pageRank(nodeDict, args.maxiteration, args.arglambda, args.threshold)
print("Page Rank found after ", iteration, " iterations.")
# if the user specified nodes to get page rank from, print them out
if(len(args.nodes) > 0):
print("Node:\tPage Rank:")
for node in args.nodes:
print("{}\t{}".format(node, pageRankDict[node]))
# if the user specified to dump the page rank to a JSON file, do so
if(args.dump):
with open("pageRank.json", 'w') as f:
json.dump(pageRankDict, f, indent=4)