-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathIndex_creator.py
More file actions
136 lines (115 loc) · 3.38 KB
/
Index_creator.py
File metadata and controls
136 lines (115 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#Index,graph and rank creator
import urllib2
import operator
import json
def get_next_target(page):
    """Return (url, end_position) for the first '<a href=' anchor in page.

    The URL is the text between the first pair of double quotes after the
    anchor marker. Returns (None, 0) when no anchor is present.
    """
    anchor_pos = page.find('<a href=')
    if anchor_pos == -1:
        return None, 0
    open_quote = page.find('"', anchor_pos)
    close_quote = page.find('"', open_quote + 1)
    return page[open_quote + 1:close_quote], close_quote
def get_all_links(page):
    """Collect every href URL found in page, in order of appearance."""
    found = []
    url, end_pos = get_next_target(page)
    while url:
        found.append(url)
        # Resume scanning just past the closing quote of the last match.
        page = page[end_pos:]
        url, end_pos = get_next_target(page)
    return found
def get_page(link):
    """Fetch the HTML body of link; return '' for mailto links or on error.

    Uses Python 2's urllib2 with a browser-like User-Agent (some sites
    reject the default Python agent). Network/URL failures are treated as
    an empty page so the crawl can continue.
    """
    if link.find('mailto') != -1:
        return ''
    try:
        request = urllib2.Request(
            link,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'})
        return urllib2.urlopen(request).read()
    except (urllib2.URLError, urllib2.HTTPError, ValueError):
        return ''
def split_string(source):
    """Split source into words, treating any char in the delimiter set
    (",!-.;/?@ #") as a separator. Runs of delimiters yield no empty words.
    """
    delimiters = ",!-.;/?@ #"
    words = []
    current = ''
    for ch in source:
        if ch in delimiters:
            # Delimiter closes the word in progress, if any.
            if current:
                words.append(current)
                current = ''
        else:
            current += ch
    if current:
        words.append(current)
    return words
def lookup(index, keyword):
    """Return the list of URLs stored under keyword, or None if absent."""
    return index.get(keyword)
def add_to_index(index, keyword, url):
    """Record url under keyword in index, skipping duplicate URLs."""
    urls = index.setdefault(keyword, [])
    if url not in urls:
        urls.append(url)
def add_page_to_index(index, url, content):
    """Index every word appearing in content under the given url."""
    for token in split_string(content):
        add_to_index(index, token, url)
def union(p, q):
    """Extend list p in place with the elements of q not already in p,
    preserving q's order. Duplicates within q are added only once.
    """
    p.extend(element for element in q if element not in p)
def compute_ranks(graph):
    """Compute PageRank-style scores for every page in a link graph.

    Args:
        graph: dict mapping each page to the list of pages it links to.

    Returns:
        dict mapping each page to its float rank after a fixed number of
        iterations. An empty graph returns {} (the original code raised
        ZeroDivisionError on `1.0 / npages` in that case).
    """
    damping = 0.8       # probability mass passed along links vs. random jump
    iterations = 10     # fixed-point iterations; plenty for small graphs
    npages = len(graph)
    if npages == 0:
        # Guard: avoid division by zero for an empty crawl.
        return {}
    # Start from a uniform distribution over all pages.
    ranks = {page: 1.0 / npages for page in graph}
    for _ in range(iterations):
        newranks = {}
        for page in graph:
            newrank = (1 - damping) / npages
            # Each page linking here donates an equal share of its rank.
            for node in graph:
                if page in graph[node]:
                    # graph[node] contains `page`, so len(graph[node]) >= 1.
                    newrank += damping * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
def crawl_web(seed, max_depth):
    """Breadth-first crawl starting at seed, down to max_depth link levels.

    Builds a keyword index from every crawled page and writes it to
    'index.txt' as JSON (side effect). Returns the link graph: a dict
    mapping each crawled page URL to its list of outgoing links.

    Note: removed the unused locals (sum_depth, sum_each, i) from the
    original; they were never read.
    """
    tocrawl = [seed]
    crawled = []        # pages already fetched, to avoid revisits
    next_depth = []     # frontier for the next level, deduplicated via union()
    index = {}
    graph = {}
    depth = 0
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(next_depth, outlinks)
            crawled.append(page)
        if not tocrawl:
            # Current level exhausted: promote the next level's frontier.
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    with open('index.txt', 'w') as file:
        file.write(json.dumps(index))
    return graph
# --- Script entry point: crawl Wikipedia, then write rank scores. ---
seed = 'https://en.wikipedia.org/wiki/Main_Page'
# int() is required on Python 3, where input() returns a string (the
# original relied on Python 2's input() evaluating to an int); without it
# `depth <= max_depth` inside crawl_web raises TypeError. Harmless on Py2.
d = int(input("Enter the depth of the index you want to create:"))
graph = crawl_web(seed, d)
ranks = compute_ranks(graph)
with open('ranks.txt', 'w') as file:
    file.write(json.dumps(ranks))