crawler.py
"""
Author:
Amin Dziri
"""
from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser
from urllib.error import URLError

import re
import time

import requests

from scraper import Scraper
from utils import read_initial_urls, filter_urls

# Debugger-related setting (pydevd), left over from development.
PYDEVD_WARN_SLOW_RESOLVE_TIMEOUT = 1

class YahooCrawler:

    def __init__(self, n_urls: int, initial_urls: str, base_page: str):
        """
        Initializes the crawler.

        Args:
            n_urls (int): Maximum number of URLs that should be crawled
            initial_urls (str): Path to the initial_urls.txt file
            base_page (str): URL on which the crawler should stay
        """
        self.url_queue = read_initial_urls(initial_urls)
        self.visited_urls = set()
        self.scraper = Scraper()
        self.base_page = base_page
        self.n_urls = n_urls
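
    # Note (assumption): initial_urls.txt is expected to hold the seed URLs that
    # read_initial_urls (utils.py) loads into url_queue; url_queue then acts as
    # a FIFO list of pages that still have to be visited.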

    def _check_allowance(self, url: str) -> bool:
        """
        Checks for a given webpage URL whether it may be crawled, by consulting
        the site's robots.txt.

        Args:
            url (str): The URL of the webpage

        Returns:
            bool: True if the URL may be crawled, False otherwise

        Raises:
            URLError: If the robots.txt file cannot be fetched
        """
        robots_txt_url = self._get_robots_txt_url(url)
        robot_parser = RobotFileParser()
        robot_parser.set_url(robots_txt_url)
        robot_parser.read()
        return robot_parser.can_fetch("*", url)
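
    # Illustrative example (not executed): if the site's robots.txt contains
    #   User-agent: *
    #   Disallow: /private/
    # then robot_parser.can_fetch("*", "<base>/private/page") returns False and
    # the crawler will skip that page.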

    def _get_robots_txt_url(self, url: str) -> str:
        """
        Returns, for a given URL, the URL of that website's robots.txt file.

        Args:
            url (str): The URL of the webpage

        Returns:
            robot_txt_url (str): The URL of the robots.txt file of the page

        Raises:
            /
        """
        split_url = urlsplit(url)
        # The robots.txt URL is base_url + /robots.txt
        base_url = split_url.scheme + '://' + split_url.netloc
        robot_txt_url = base_url + '/robots.txt'
        return robot_txt_url
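
    # Illustrative example (not executed): for a page such as
    # "https://finance.yahoo.com/news/some-article", urlsplit gives the scheme
    # "https" and netloc "finance.yahoo.com", so the method returns
    # "https://finance.yahoo.com/robots.txt".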

    def _get_page_urls(self, url: str) -> list:
        """
        Makes a GET request to the given URL, parses the HTML and extracts all
        the URLs found in the page.

        Args:
            url (str): A URL to a webpage

        Returns:
            urls (list): A list with all the URLs in the page text that belong
                to the base page

        Raises:
            /
        """
        try:
            page_info = requests.get(url, timeout=2.50)
        except requests.RequestException:
            # Network errors and timeouts should not stop the whole crawl
            return []
        page_text = page_info.text
        # Extract absolute http(s) links from href attributes
        urls = re.findall(r'href=["\']?(https?://[^\s"\'<>]+)', page_text)
        urls = filter_urls(urls, self.base_page)
        return urls
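
    # Illustrative example (not executed): given page HTML containing
    #   <a href="https://finance.yahoo.com/quote/AAPL/">Apple</a>
    # the regular expression captures "https://finance.yahoo.com/quote/AAPL/";
    # filter_urls (utils.py) is then assumed to keep only the links that stay
    # on self.base_page.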

    def crawl_urls(self) -> None:
        """
        Main crawling loop. Processes URLs from the queue until n_urls URLs
        have been handled or the queue is empty.

        Args:
            /

        Returns:
            /

        Raises:
            /
        """
        i = 1
        start_time = time.time()
        # Treat url_queue as a FIFO queue instead of removing items from the
        # list while iterating over it
        while self.url_queue and i <= self.n_urls:
            url = self.url_queue.pop(0)
            print(f"Currently scraping URL no. {i}, time: {(time.time() - start_time) / 3600:.2f} h")
            i += 1
            # Check if the current webpage was already visited
            if url in self.visited_urls:
                continue
            # Check if the webpage allows crawling
            try:
                if not self._check_allowance(url):
                    continue
            except URLError:
                print(f"The URL {url} is not correct")
                continue
            self.visited_urls.add(url)
            # Queue the newly found on-site links and scrape the current page
            crawled_urls = self._get_page_urls(url)
            self.url_queue.extend(crawled_urls)
            self.scraper.scrape(url)
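
    # Illustrative example (not executed): with n_urls=50 and a list of Yahoo
    # Finance seed pages, the loop pops one URL at a time, skips it if it was
    # already visited or is disallowed by robots.txt, otherwise scrapes it and
    # appends the newly discovered on-site links to the end of the queue
    # (breadth-first order).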

if __name__ == '__main__':
    crawler = YahooCrawler(n_urls=50, initial_urls="initial_urls.txt", base_page="https://finance.yahoo.com/")
    crawler.crawl_urls()