-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
137 lines (113 loc) · 4.8 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
## A Web Crawler.
## Author : Amandeep
## version : v2.4
import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
class webcrawler:
    """A small breadth-agnostic web crawler.

    Starting from a seed address, the crawler fetches each page, collects the
    targets of its ``<a>`` tags and keeps going until no unvisited link is
    left. Successfully fetched pages are printed as they are crawled and
    recorded, in order, in ``self.crawled``.

    Attributes:
        tocrawl: set of absolute URLs still waiting to be fetched.
        crawled: list of URLs that were fetched successfully, in crawl order.
    """

    # URL schemes the crawler is willing to fetch.
    _SCHEMES = ("http", "https")

    def __init__(self, address):
        """Seed the crawler with *address* and immediately start crawling.

        Args:
            address: the seed URL. ``http://`` is prepended when the address
                carries neither an ``http://`` nor an ``https://`` prefix.
        """
        self.tocrawl = {self._with_scheme(address)}
        self.crawled = []
        self.crawl_web()

    @staticmethod
    def _with_scheme(address):
        """Return *address* stripped of whitespace, with ``http://`` added if
        it does not already start with an http(s) scheme."""
        address = address.strip()
        # Checking both schemes avoids turning "https://x" into
        # "http://https://x".
        if address.lower().startswith(("http://", "https://")):
            return address
        return "http://" + address

    @staticmethod
    def _resolve_link(base, href):
        """Turn an ``href`` found on page *base* into an absolute http(s) URL.

        Returns None for missing/empty hrefs, malformed URLs and non-http(s)
        schemes (``mailto:``, ``javascript:`` ...). Fragments are dropped so
        that ``page#a`` and ``page#b`` are treated as the same page.
        """
        if not href or not isinstance(href, str):
            return None
        try:
            absolute = urllib.parse.urljoin(base, href.strip())
            absolute, _fragment = urllib.parse.urldefrag(absolute)
            scheme = urllib.parse.urlsplit(absolute).scheme
        except ValueError:
            # urllib.parse raises ValueError on malformed netlocs.
            return None
        if scheme not in webcrawler._SCHEMES:
            return None
        return absolute

    # Manual (string based) page parsing helpers.
    def next_link(self, page, start=0):
        """Return the first link found in *page* at or after index *start*.

        The link is the first double-quoted string that follows the literal
        ``<a href=`` marker.

        Args:
            page: the page markup as a string.
            start: index at which the search begins (defaults to 0).

        Returns:
            ``(url, end_quote)`` where ``end_quote`` is the index in *page* of
            the closing quote, or ``(None, 0)`` when there is no marker or the
            quoted value is not terminated.

        >>> webcrawler.next_link(None, '<a href="x">')
        ('x', 10)
        >>> webcrawler.next_link(None, 'no anchors here')
        (None, 0)
        """
        start_link = page.find('<a href=', start)
        if start_link == -1:
            return None, 0
        start_quote = page.find('"', start_link)
        if start_quote == -1:
            return None, 0
        end_quote = page.find('"', start_quote + 1)
        if end_quote == -1:
            # Unterminated attribute value: nothing usable from here on.
            return None, 0
        return page[start_quote + 1:end_quote], end_quote

    def get_all_links(self, page):
        """Return every non-empty link in *page*, in document order.

        Uses ``next_link`` repeatedly; an empty ``href=""`` is skipped rather
        than ending the scan.

        >>> webcrawler.__new__(webcrawler).get_all_links(
        ...     '<a href="a">1</a><a href="b">2</a>')
        ['a', 'b']
        """
        links = []
        position = 0
        while True:
            link, end_pos = self.next_link(page, position)
            if link is None:
                return links
            if link:
                links.append(link)
            # Resume at the closing quote instead of re-slicing the page,
            # which would copy the remaining text on every iteration.
            position = end_pos

    def union(self, list1, list2):
        """Append to *list1* every item of *list2* it does not yet contain.

        *list1* is modified in place and also returned.

        >>> webcrawler.union(None, [1, 2, 3], [2, 3, 4])
        [1, 2, 3, 4]
        >>> webcrawler.union(None, ['a', 'b'], ['b', 'c'])
        ['a', 'b', 'c']
        """
        for link in list2:
            if link not in list1:
                list1.append(link)
        return list1

    def crawl_web(self):
        """Crawl until ``self.tocrawl`` is empty.

        Every successfully fetched URL is printed and appended to
        ``self.crawled``. Pages that cannot be fetched are skipped and not
        retried. Links are resolved against the page they were found on, and
        only http(s) targets are queued.
        """
        # Set for O(1) membership; also remembers failed URLs so that they
        # are not fetched again each time another page links to them.
        visited = set(self.crawled)
        while self.tocrawl:
            crawl_link = self.tocrawl.pop()
            if crawl_link in visited:
                continue
            visited.add(crawl_link)
            try:
                soup = self.get_page_data(crawl_link)
            except Exception:
                # Best-effort crawl: an unreachable or invalid page is simply
                # skipped. Catching Exception (not a bare except) keeps
                # Ctrl-C / SystemExit able to stop the crawl.
                continue
            for anchor in soup.find_all('a'):
                target = self._resolve_link(crawl_link, anchor.get('href'))
                if target is not None and target not in visited:
                    self.tocrawl.add(target)
            print(crawl_link)
            self.crawled.append(crawl_link)

    def get_page_data(self, link, timeout=10):
        """Fetch *link* and return its parsed markup.

        Args:
            link: absolute URL of the page to download.
            timeout: seconds to wait on the network before giving up.

        Returns:
            A ``BeautifulSoup`` object for the downloaded page.

        Raises:
            Exception: with the message "Invalid Link" when the page cannot
                be downloaded or parsed; the original error is chained.
        """
        try:
            with urllib.request.urlopen(link, timeout=timeout) as response:
                # Hand the raw bytes to BeautifulSoup so it can detect the
                # encoding; str(bytes) would yield the "b'...'" repr instead
                # of the page text.
                page = response.read()
            soup = BeautifulSoup(page, "html.parser")
        except Exception as err:
            raise Exception("Invalid Link") from err
        return soup

    def get_links(self):
        """Return the list of successfully crawled URLs."""
        return self.crawled

    def __str__(self):
        """Return the crawled links, one per line, under a short heading."""
        return "\n".join(["Crawled links are: ", *self.crawled])
if __name__ == "__main__":
    # Prompt and crawl only when executed as a script, so that importing this
    # module does not block on stdin or start network traffic.
    link = input("Enter the link to be crawled : ")
    webcrawler(link)