WebCrawler
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
from urllib2 import urlopen
from urlparse import urljoin
class WebCrawler(HTMLParser):
    def __init__(self, input_url, depth, limit):
        HTMLParser.__init__(self)
        self.url = input_url
        self.link_dict = {self.url: 1}
        self.node = [self.url]
        self.depth = depth    # maximum crawl depth
        self.limit = limit    # limit of links to be obtained per url
        self.links_found = 0

    def handle_starttag(self, tag, attrs):
        if self.links_found < self.limit and tag == 'a':
            link = dict(attrs).get('href')
            if not link:
                return
            # Resolve relative links against the page being crawled;
            # urljoin is more robust than splicing the URL by hand.
            link = urljoin(self.url, link)
            if link not in self.link_dict:
                print 'new link ---> %s' % link
                self.links_found += 1
                self.node.append(link)
            # Count every sighting of the link, not just the first.
            self.link_dict[link] = self.link_dict.get(link, 0) + 1
    def crawler(self):
        for depth in xrange(self.depth):
            print '####### Depth %d #########' % (depth + 1)
            context_node = self.node[:]
            self.node = []
            for self.url in context_node:
                self.links_found = 0
                try:
                    res = urlopen(self.url).read()
                    self.feed(res)
                except Exception:
                    # Clear the parser state so the next page starts clean.
                    self.reset()
            print '############ URL CRAWLED ###############'
        # Sort by frequency, least common first.
        extracted_links = [(v, k) for (k, v) in self.link_dict.items()]
        extracted_links.sort()
        return extracted_links
if __name__ == "__main__":
    wc = WebCrawler(input_url='http://www.goal.com/en-india/', depth=5, limit=10)
    result = wc.crawler()
    for (n, link) in result:
        print "%s was found %d time%s." % (link, n, "s" if n != 1 else "")