'''
Python3 based Web Crawler.
Author : Rishabh Verma
'''
import urllib.request
# Returns hard-coded sample pages so the crawler can be tested offline.
def get_page(url):
    try:
        if url == "https://www.rishabhverma.in/index.html":
            return ('<html> <body> This is a test page for learning to crawl! '
                    '<p> It is a good idea to '
                    '<a href="https://www.rishabhverma.in/crawling.html">learn to '
                    'crawl</a> before you try to '
                    '<a href="https://www.rishabhverma.in/walking.html">walk</a> '
                    'or <a href="https://www.rishabhverma.in/flying.html">fly</a>. '
                    '</p> </body> </html> ')
        elif url == "https://www.rishabhverma.in/crawling.html":
            return ('<html> <body> I have not learned to crawl yet, but I '
                    'am quite good at '
                    '<a href="https://www.rishabhverma.in/kicking.html">kicking</a>.'
                    '</body> </html>')
        elif url == "https://www.rishabhverma.in/walking.html":
            return ('<html> <body> I cant get enough '
                    '<a href="https://www.rishabhverma.in/index.html">crawling</a>! '
                    '</body> </html>')
        elif url == "https://www.rishabhverma.in/flying.html":
            return ('<html> <body> The magic words are Squeamish Ossifrage! '
                    '</body> </html>')
    except Exception:
        return ""
    return ""
# Get content as text from a web page.
'''
--> def get_page_internet(url):
Reads the web page at the provided URL and returns its content as text,
i.e. the same text as viewed in the page source (with HTML tags).
'''
def get_page_internet(url):
    url_read = urllib.request.urlopen(url)
    page_bytes = url_read.read()
    # Decode the raw bytes so the links can be searched as ordinary text.
    return page_bytes.decode('utf-8', errors='ignore')
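# A quick illustration (hypothetical URL, not executed here):
#   html = get_page_internet("https://example.com/")
#   # html now holds the decoded page source, e.g. '<!doctype html> ...'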
# Get the first link [URL] encountered in an HTML page.
'''
--> def get_next_target(page):
Takes an HTML page as text input.
Finds the first "<a href=" tag it encounters and extracts that URL.
Returns the URL and the index of its closing quote, so the caller can
continue searching the page from just after the first encountered URL.
'''
def get_next_target(page):
    start_link = page.find("<a href=")
    # Stopping condition: no more links on the page.
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
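# A small illustration (hypothetical markup, not executed here):
#   get_next_target('see <a href="https://a.example">a</a>')
#   # -> ('https://a.example', <index of the closing quote>)
#   get_next_target('no links here')
#   # -> (None, 0)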
# Adds every element of q that is not already in p to p
# (an in-place union of two lists, used to merge new links into tocrawl).
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
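# For example (hypothetical values, shown only to illustrate the in-place update):
#   seen = ['a.html', 'b.html']
#   union(seen, ['b.html', 'c.html'])
#   # seen is now ['a.html', 'b.html', 'c.html']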
'''
--> def get_all_links(page):
Returns all the URLs present in the page, collected in links_list[].
'''
def get_all_links(page):
    # Stores all the URLs of the passed HTML [seed] page.
    links_list = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            # Add the link to the list.
            links_list.append(url)
            # Restart the search from the new index,
            # just after the last encountered URL.
            page = page[endpos:]
        else:
            break
    return links_list
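# A quick illustration (hypothetical markup, not executed here):
#   get_all_links('<a href="https://a.example">a</a> <a href="https://b.example">b</a>')
#   # -> ['https://a.example', 'https://b.example']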
# The data structure used here is a multimap:
'''
index[] --> [keyword, [URL]] : a single keyword mapped to multiple URLs.
If the passed keyword already exists in the index[] list, the URL is appended
to that entry's URL list.
If it is not found, a new entry [keyword, [URL]] is appended.
'''
def add_to_index(index, keyword, url):
    for entry in index:
        # Search for the passed keyword in the index[] list.
        if entry[0] == keyword:
            # If the keyword exists, append the URL to its list.
            entry[1].append(url)
            return
    # If the keyword was not found, add a new entry with the keyword and URL.
    index.append([keyword, [url]])
# Splits the page content into words and indexes each word under the page's URL.
def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)
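# A small illustration of the index layout (hypothetical values, not executed here):
#   index = []
#   add_page_to_index(index, 'https://a.example', 'learn to crawl')
#   # index is now [['learn', ['https://a.example']],
#   #               ['to', ['https://a.example']],
#   #               ['crawl', ['https://a.example']]]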
'''
--> def lookup(index, keyword):
Returns the list of URLs associated with the keyword.
If the keyword is not in the index, an empty list is returned.
'''
def lookup(index, keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []
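# Continuing the illustration above (hypothetical values, not executed here):
#   lookup(index, 'crawl')    # -> ['https://a.example']
#   lookup(index, 'missing')  # -> []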
# The seed page [the HTML page to start crawling from] is passed as input.
# Returns the index built from every crawled page.
# max_pages -> sets the maximum number of pages that will be crawled.
def crawl_web(seed, max_pages):
    # HTML pages still to be crawled.
    tocrawl = [seed]
    # HTML pages already crawled.
    crawled = []
    index = []
    while tocrawl and len(crawled) < max_pages:
        # Take the last URL in the tocrawl list and remove it from that list.
        last_url = tocrawl.pop()
        # Only fetch the URL if it has not been crawled already.
        if last_url not in crawled:
            content = get_page_internet(last_url)
            add_page_to_index(index, last_url, content)
            # Uncomment this to crawl the sample pages provided in the script
            # [to test without an internet connection]:
            # union(tocrawl, get_all_links(get_page(last_url)))
            union(tocrawl, get_all_links(content))
            crawled.append(last_url)
    return index
print(crawl_web("https://www.rishabhverma.in/webCrawling/index.html", 5))
# print(crawl_web("https://www.rishabhverma.in/index.html", 3))
#print(get_page("https://www.rishabhverma.in/index.html"))
#print(get_page_internet("https://www.rishabhverma.in/webCrawling/"))