Add raise_for_status=True when scraping sources
monosans committed Jan 23, 2024
1 parent 73303ca commit dca437f
Showing 5 changed files with 11 additions and 16 deletions.
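
This commit sets raise_for_status=True once on each aiohttp.ClientSession instead of passing it to individual session.get() calls. With the session-level flag, any response with a 4xx or 5xx status raises aiohttp.ClientResponseError as soon as the request completes. A minimal standalone sketch of that behaviour (illustrative only, not code from this repository; the URL is a placeholder):

import asyncio

from aiohttp import ClientResponseError, ClientSession


async def main() -> None:
    # raise_for_status=True applies to every request made through the session.
    async with ClientSession(raise_for_status=True) as session:
        try:
            async with session.get("https://httpbin.org/status/404") as response:
                await response.read()
        except ClientResponseError as e:
            # e.status and e.message describe the failed response.
            print(f"HTTP status code {e.status}: {e.message}")


asyncio.run(main())

Because the flag now lives on the session, the per-request raise_for_status=True arguments removed in the hunks below become redundant.
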
1 change: 1 addition & 0 deletions proxy_scraper_checker/__main__.py
@@ -101,6 +101,7 @@ async def main() -> None:
         connector=TCPConnector(ssl=http.SSL_CONTEXT),
         headers=http.HEADERS,
         cookie_jar=http.get_cookie_jar(),
+        raise_for_status=True,
         fallback_charset_resolver=http.fallback_charset_resolver,
     ) as session:
         settings = await Settings.from_mapping(cfg, session=session)
4 changes: 1 addition & 3 deletions proxy_scraper_checker/geodb.py
@@ -52,9 +52,7 @@ async def download_geodb(*, progress: Progress, session: ClientSession) -> None:
         else None
     )

-    async with session.get(
-        GEODB_URL, headers=headers, raise_for_status=True
-    ) as response:
+    async with session.get(GEODB_URL, headers=headers) as response:
         if response.status == 304:  # noqa: PLR2004
             logger.info(
                 "Latest geolocation database is already cached at %s",
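
The session-level flag does not interfere with the 304 handling above: aiohttp raises ClientResponseError only for statuses of 400 and above, so a 304 Not Modified response is returned normally and still reaches the status check. A short sketch under that assumption (standalone illustration with a placeholder URL, not repository code):

import asyncio

from aiohttp import ClientResponseError, ClientSession

GEODB_URL = "https://example.com/geodb.mmdb"  # placeholder, not the real URL


async def download_if_modified(last_modified: str) -> None:
    headers = {"If-Modified-Since": last_modified}
    async with ClientSession(raise_for_status=True) as session:
        try:
            async with session.get(GEODB_URL, headers=headers) as response:
                if response.status == 304:  # not an error, so no exception
                    print("Cached geolocation database is up to date")
                else:
                    print("Downloaded", len(await response.read()), "bytes")
        except ClientResponseError as e:  # raised only for 4xx/5xx
            print(f"Download failed: HTTP {e.status}")


asyncio.run(download_if_modified("Tue, 23 Jan 2024 00:00:00 GMT"))
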
5 changes: 2 additions & 3 deletions proxy_scraper_checker/proxy.py
@@ -52,11 +52,10 @@ async def check(self, *, settings: Settings) -> None:
             connector=connector,
             headers=HEADERS,
             cookie_jar=get_cookie_jar(),
+            raise_for_status=True,
             timeout=settings.timeout,
             fallback_charset_resolver=fallback_charset_resolver,
-        ) as session, session.get(
-            settings.check_website, raise_for_status=True
-        ) as response:
+        ) as session, session.get(settings.check_website) as response:
             content = await response.read()
             self.timeout = perf_counter() - start
             if settings.check_website_type == CheckWebsiteType.HTTPBIN_IP:
13 changes: 6 additions & 7 deletions proxy_scraper_checker/scraper.py
@@ -7,7 +7,7 @@
 import aiofiles
 import aiofiles.os
 import aiofiles.ospath
-from aiohttp import ClientSession, ClientTimeout
+from aiohttp import ClientResponseError, ClientSession, ClientTimeout
 from aiohttp_socks import ProxyType
 from rich.progress import Progress, TaskID

@@ -41,6 +41,10 @@ async def scrape_one(
             async with aiofiles.open(source, "rb") as f:
                 content = await f.read()
             text = bytes_decode(content)
+    except ClientResponseError as e:
+        logger.warning(
+            "%s | HTTP status code %d: %s", source, e.status, e.message
+        )
     except Exception as e:
         logger.warning(
             "%s | %s.%s: %s",
@@ -54,12 +58,7 @@
     try:
         proxy = next(proxies)
     except StopIteration:
-        if response and response.status != 200:  # noqa: PLR2004
-            logger.warning(
-                "%s | HTTP status code %d", source, response.status
-            )
-        else:
-            logger.warning("%s | No proxies found", source)
+        logger.warning("%s | No proxies found", source)
     else:
         for proxy in itertools.chain((proxy,), proxies):  # noqa: B020
             try:
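
With HTTP errors now raised by the session, scrape_one catches ClientResponseError explicitly and logs the status code and message, replacing the old after-the-fact check of response.status when a source yields no proxies. A simplified sketch of that handler pattern (assumes the session was created with raise_for_status=True; not the actual scrape_one implementation):

import logging
from typing import Optional

from aiohttp import ClientResponseError, ClientSession

logger = logging.getLogger(__name__)


async def fetch_source(session: ClientSession, source: str) -> Optional[str]:
    # session is assumed to have been created with raise_for_status=True.
    try:
        async with session.get(source) as response:
            return (await response.read()).decode()
    except ClientResponseError as e:
        # HTTP-level failures carry the status code and reason phrase.
        logger.warning("%s | HTTP status code %d: %s", source, e.status, e.message)
    except Exception as e:
        # Anything else (DNS errors, timeouts, decode errors, ...) is logged generically.
        logger.warning("%s | %s: %s", source, type(e).__qualname__, e)
    return None
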
4 changes: 1 addition & 3 deletions proxy_scraper_checker/settings.py
@@ -129,9 +129,7 @@ async def _get_check_website_type_and_real_ip(
     Tuple[Literal[CheckWebsiteType.PLAIN_IP, CheckWebsiteType.HTTPBIN_IP], str],
 ]:
     try:
-        async with session.get(
-            check_website, raise_for_status=True
-        ) as response:
+        async with session.get(check_website) as response:
             content = await response.read()
             text = get_response_text(response=response, content=content)
     except Exception: