#!/usr/bin/python3
# pylint: disable=E0401
"""
WebCrawler Module
This module defines a WebCrawler class that facilitates asynchronous web page
crawling, fetches links from HTML content, and calculates same-domain link
ratios.
It utilizes aiohttp for asynchronous HTTP requests and lxml for HTML parsing.
Classes:
- WebCrawler: A class for crawling web pages asynchronously and extracting
information.
Usage:
To use the WebCrawler class:
1. Create an instance of WebCrawler.
2. Use the crawl method to start crawling from a given URL to a specified
depth.
Example:
# Instantiate the WebCrawler
crawler = WebCrawler()
    # Start crawling from a URL up to a maximum depth and append results
    # to an output file path
    asyncio.run(crawler.crawl('https://example.com', 1, 3, 'output.tsv'))
"""
from urllib.parse import urlparse, urljoin
from typing import Optional
import asyncio

import aiohttp
import aiofiles as aiof
from aiohttp import ClientSession, ClientError, InvalidURL, ClientTimeout
from lxml import html

from config import DEFAULT_URL_PROTOCOL, DEFAULT_RETRY_COUNT, DEFAULT_BACKOFF
from utils import get_random_float
from log import LOGGER as log


class WebCrawler:
"""
WebCrawler Class
This class implements asynchronous web page crawling, link extraction, and
same-domain link ratio calculation functionalities.
It uses aiohttp for asynchronous HTTP requests and lxml for HTML parsing.
Attributes:
fetched_urls (set): A set containing visited URLs during crawling.
timeout (ClientTimeout): A ClientTimeout object, used for setting
request timeouts.
Methods:
    - add_protocol(url: str, protocol: str = DEFAULT_URL_PROTOCOL) -> str:
      Adds a protocol (HTTP/HTTPS) to a URL if not specified.
- fetch_page_async(session: ClientSession, url: str) -> Optional[bytes]:
Asynchronously fetches content of a web page using an HTTP GET request.
- fetch_links(html_content: bytes, base_url: str) -> set[str]:
Extracts links from HTML content given a base URL.
- calculate_same_domain_ratio(current_url: str, links: set) -> float:
Calculates the ratio of same-domain links to total links.
    - crawl(url: str, depth: int, max_depth: int, output_file: str) -> None:
      Crawls web pages starting from a given URL up to a specified depth,
      extracting information and writing results to an output file.
Usage:
1. Instantiate the WebCrawler class.
2. Use the crawl method to start crawling from a URL to a specified depth.
3. Implement additional functionalities as required.
Example:
# Instantiate the WebCrawler
crawler = WebCrawler()
    # Start crawling from a URL up to a maximum depth and append results
    # to an output file path
    asyncio.run(crawler.crawl('https://example.com', 1, 3, 'output.tsv'))
"""
    __slots__ = ("fetched_urls", "timeout")

    def __init__(self) -> None:
        self.fetched_urls = set()
        self.timeout = ClientTimeout(connect=5, total=15)

def add_protocol(
self, url: str, protocol: str = DEFAULT_URL_PROTOCOL
) -> str:
"""
Adds a protocol (HTTP/HTTPS) to the provided URL if no protocol
is specified. Returns the modified URL with the added protocol.
Args:
- url (str): The URL to which the protocol will be added.
        - protocol (str, optional): The protocol to prepend when the URL
          lacks one. Defaults to DEFAULT_URL_PROTOCOL.
Returns:
- str: The URL with the added protocol.
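        Example (illustrative; assumes DEFAULT_URL_PROTOCOL == "https"):
            >>> WebCrawler().add_protocol("example.com")
            'https://example.com'
            >>> WebCrawler().add_protocol("http://example.com")
            'http://example.com'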
"""
parsed_url = urlparse(url)
if not parsed_url.scheme and not parsed_url.netloc:
return f"{protocol}://{url}"
return url

    async def fetch_page_async(
        self, session: ClientSession, url: str
    ) -> Optional[bytes]:
"""
Asynchronously fetches the content of a web page given its URL
using an HTTP GET request.
Args:
- session (ClientSession): The aiohttp ClientSession to use for
making the request.
- url (str): The URL of the web page to fetch.
Returns:
- Optional[bytes]: The content of the web page as bytes if
the request is successful (status code 200).
Returns None if there's an error or if the
status code is not 200.
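        Note:
        - Retries on timeouts and client errors with exponential backoff
          plus random jitter. As an illustration, assuming
          DEFAULT_RETRY_COUNT = 3 and DEFAULT_BACKOFF = 1, failed attempts
          are spaced by roughly 1s, then 2s, with up to 1s of jitter added
          to each delay.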
"""
retry_count = DEFAULT_RETRY_COUNT
backoff = DEFAULT_BACKOFF
while retry_count > 0:
try:
async with session.get(url, timeout=self.timeout) as response:
if response.status == 200:
return await response.read()
return None
except InvalidURL:
log.error(f"Invalid URL for crawling: {url}")
return None
            except asyncio.TimeoutError:
                log.error(f"Request timed out for URL {url}")
except ClientError as ex:
log.exception(f"HTTP Client error: {ex}")
            retry_count -= 1
            if retry_count > 0:
                # Exponential backoff with jitter before the next attempt
                delay_with_jitter = backoff + get_random_float(0, 1)
                await asyncio.sleep(delay_with_jitter)
                backoff *= 2
        log.warning(f"Failed to fetch {url} after multiple retries")
        return None

    def fetch_links(self, html_content: bytes, base_url: str) -> set[str]:
"""
Fetches links from HTML content given a base URL.
Args:
- html_content (bytes): The HTML content from which links are to
be extracted.
- base_url (str): The base URL used to resolve relative links.
Returns:
- Set[str]: A set of absolute URLs derived from the HTML content.
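        Example (illustrative):
            >>> WebCrawler().fetch_links(
            ...     b'<html><body><a href="/about">About</a></body></html>',
            ...     "https://example.com")
            {'https://example.com/about'}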
"""
try:
tree = html.fromstring(html_content)
links = tree.xpath("//a/@href")
links = [urljoin(base_url, link) for link in links]
return set(links)
except Exception as e:
log.exception(f"Error extracting links: {e}")
return set()

    def calculate_same_domain_ratio(
        self, current_url: str, links: set
    ) -> float:
"""
Calculates the ratio of same-domain links in comparison to the
total links provided.
Args:
- current_url (str): The URL of the current page.
- links (Set[str]): A set of URLs to calculate the same-domain
ratio.
Returns:
- float: The ratio of same-domain links to the total number of
links.
Returns 0.0 if the input set of links is empty.
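        Example (illustrative):
            >>> WebCrawler().calculate_same_domain_ratio(
            ...     "https://a.com/x", {"https://a.com/y", "https://b.com/z"})
            0.5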
"""
        current_domain = urlparse(current_url).netloc
        same_domain_count = sum(
            urlparse(link).netloc == current_domain for link in links
        )
        return round(same_domain_count / len(links), 2) if links else 0.0

    async def write_to_file(self, output_file: str, content: str) -> None:
        """Appends content asynchronously to the file at output_file."""
        async with aiof.open(output_file, 'a', encoding='utf-8') as file:
            await file.write(content)

    async def crawl(
        self, url: str, depth: int, max_depth: int, output_file: str
    ) -> None:
"""
Crawls through web pages starting from a given URL up to a
specified depth, extracting information and writing results to an
output file.
Args:
- url (str): The starting URL for crawling.
- depth (int): The current depth in the crawling tree.
- max_depth (int): The maximum depth to which the crawler should
explore.
        - output_file (str): Path of the file to which output lines are
          appended.
        Returns:
        - None: Returns early without crawling if the depth exceeds
          max_depth or if the URL has already been visited.
Note:
- The function uses an asynchronous approach for web page crawling.
"""
if depth > max_depth or url in self.fetched_urls:
return
self.fetched_urls.add(url)
async with aiohttp.ClientSession() as session:
try:
html_content = await self.fetch_page_async(session, url)
except Exception as e:
log.exception(f"Failed to crawl: {url}, with exception: {e}")
return
if html_content:
links = self.fetch_links(html_content, url)
                same_domain_ratio = self.calculate_same_domain_ratio(url, links)
output_content = f"{url}\t{depth}\t{same_domain_ratio}\n"
await self.write_to_file(output_file, output_content)
tasks = [
self.crawl(link, depth + 1, max_depth, output_file)
for link in links
]
await asyncio.gather(*tasks)
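

if __name__ == "__main__":
    # Minimal usage sketch (illustrative): the seed URL, depth bounds, and
    # output path below are assumptions for demonstration, not part of the
    # module's public contract.
    crawler = WebCrawler()
    asyncio.run(crawler.crawl("https://example.com", 1, 2, "output.tsv"))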