From 73303cab1b4cf68fda08209daef4c3aa31d5e7da Mon Sep 17 00:00:00 2001
From: monosans
Date: Tue, 23 Jan 2024 09:37:29 +0300
Subject: [PATCH] Improve response decoding performance

---
 proxy_scraper_checker/__main__.py |  1 +
 proxy_scraper_checker/http.py     | 21 ++++++++++++++++++++-
 proxy_scraper_checker/proxy.py    | 18 ++++++++++++++----
 proxy_scraper_checker/scraper.py  |  4 +++-
 proxy_scraper_checker/settings.py | 10 ++++++----
 5 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/proxy_scraper_checker/__main__.py b/proxy_scraper_checker/__main__.py
index a93c2c5d8..8a3c44932 100644
--- a/proxy_scraper_checker/__main__.py
+++ b/proxy_scraper_checker/__main__.py
@@ -101,6 +101,7 @@ async def main() -> None:
         connector=TCPConnector(ssl=http.SSL_CONTEXT),
         headers=http.HEADERS,
         cookie_jar=http.get_cookie_jar(),
+        fallback_charset_resolver=http.fallback_charset_resolver,
     ) as session:
         settings = await Settings.from_mapping(cfg, session=session)
         storage = ProxyStorage(protocols=settings.sources)
diff --git a/proxy_scraper_checker/http.py b/proxy_scraper_checker/http.py
index 0e1402dda..66523e71a 100644
--- a/proxy_scraper_checker/http.py
+++ b/proxy_scraper_checker/http.py
@@ -5,9 +5,17 @@
 from types import MappingProxyType
 
 import certifi
-from aiohttp import DummyCookieJar, hdrs
+from aiohttp import ClientResponse, DummyCookieJar, hdrs
+
+from .utils import bytes_decode
 
 SSL_CONTEXT = ssl.create_default_context(cafile=certifi.where())
+
+
+class NoCharsetHeaderError(Exception):
+    pass
+
+
 HEADERS: MappingProxyType[str, str] = MappingProxyType({
     hdrs.USER_AGENT: (
         "Mozilla/5.0 (Windows NT 10.0; rv:121.0) Gecko/20100101 Firefox/121.0"
@@ -18,3 +26,14 @@
 @lru_cache(None)
 def get_cookie_jar() -> DummyCookieJar:
     return DummyCookieJar()
+
+
+def get_response_text(*, response: ClientResponse, content: bytes) -> str:
+    try:
+        return content.decode(response.get_encoding())
+    except (NoCharsetHeaderError, UnicodeDecodeError):
+        return bytes_decode(content)
+
+
+def fallback_charset_resolver(r: ClientResponse, b: bytes) -> str:  # noqa: ARG001
+    raise NoCharsetHeaderError
diff --git a/proxy_scraper_checker/proxy.py b/proxy_scraper_checker/proxy.py
index 8d255b4c7..ee40c04c3 100644
--- a/proxy_scraper_checker/proxy.py
+++ b/proxy_scraper_checker/proxy.py
@@ -9,10 +9,15 @@
 from aiohttp import ClientSession
 from aiohttp_socks import ProxyConnector, ProxyType
 
-from .http import HEADERS, SSL_CONTEXT, get_cookie_jar
+from .http import (
+    HEADERS,
+    SSL_CONTEXT,
+    fallback_charset_resolver,
+    get_cookie_jar,
+    get_response_text,
+)
 from .parsers import parse_ipv4
 from .settings import CheckWebsiteType, Settings
-from .utils import bytes_decode
 
 
 @attrs.define(
@@ -48,16 +53,21 @@ async def check(self, *, settings: Settings) -> None:
             headers=HEADERS,
             cookie_jar=get_cookie_jar(),
             timeout=settings.timeout,
+            fallback_charset_resolver=fallback_charset_resolver,
         ) as session, session.get(
             settings.check_website, raise_for_status=True
         ) as response:
             content = await response.read()
         self.timeout = perf_counter() - start
         if settings.check_website_type == CheckWebsiteType.HTTPBIN_IP:
-            r = json.loads(bytes_decode(content))
+            r = json.loads(
+                get_response_text(response=response, content=content)
+            )
             self.exit_ip = r["origin"]
         elif settings.check_website_type == CheckWebsiteType.PLAIN_IP:
-            self.exit_ip = parse_ipv4(bytes_decode(content))
+            self.exit_ip = parse_ipv4(
+                get_response_text(response=response, content=content)
+            )
 
     def as_str(self, *, include_protocol: bool) -> str:
         with StringIO() as buf:
diff --git a/proxy_scraper_checker/scraper.py b/proxy_scraper_checker/scraper.py
index 599879dfe..7fb59e7dc 100644
--- a/proxy_scraper_checker/scraper.py
+++ b/proxy_scraper_checker/scraper.py
@@ -11,6 +11,7 @@
 from aiohttp_socks import ProxyType
 from rich.progress import Progress, TaskID
 
+from .http import get_response_text
 from .parsers import PROXY_REGEX
 from .proxy import Proxy
 from .settings import Settings
@@ -34,11 +35,12 @@ async def scrape_one(
         if is_url(source):
             async with session.get(source, timeout=timeout) as response:
                 content = await response.read()
+            text = get_response_text(response=response, content=content)
         else:
             response = None
             async with aiofiles.open(source, "rb") as f:
                 content = await f.read()
-        text = bytes_decode(content)
+            text = bytes_decode(content)
     except Exception as e:
         logger.warning(
             "%s | %s.%s: %s",
diff --git a/proxy_scraper_checker/settings.py b/proxy_scraper_checker/settings.py
index fd8e4b28e..0a3354101 100644
--- a/proxy_scraper_checker/settings.py
+++ b/proxy_scraper_checker/settings.py
@@ -25,10 +25,10 @@
 from aiohttp_socks import ProxyType
 
 from . import sort
+from .http import get_response_text
 from .null_context import NullContext
 from .parsers import parse_ipv4
 from .typing_compat import Any, Literal, Self
-from .utils import bytes_decode
 
 if TYPE_CHECKING:
     from .proxy import Proxy
@@ -129,9 +129,11 @@ async def _get_check_website_type_and_real_ip(
     Tuple[Literal[CheckWebsiteType.PLAIN_IP, CheckWebsiteType.HTTPBIN_IP], str],
 ]:
     try:
-        async with session.get(check_website, raise_for_status=True) as r:
-            content = await r.read()
-            text = bytes_decode(content)
+        async with session.get(
+            check_website, raise_for_status=True
+        ) as response:
+            content = await response.read()
+            text = get_response_text(response=response, content=content)
     except Exception:
         logger.exception(
             "Error when opening check_website without proxy, it will be "
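
Notes (not part of the patch): below is a minimal, self-contained sketch of the decoding flow this change introduces. The session-level fallback_charset_resolver raises NoCharsetHeaderError instead of letting aiohttp pick a fallback encoding for responses whose Content-Type declares no charset; get_response_text catches that error (or a UnicodeDecodeError from a wrong header) and only then runs charset detection. The bytes_decode body shown here is an assumption based on charset-normalizer, since the patch only imports it from .utils; the URL in the usage example is purely illustrative.

# Standalone sketch (not repository code) of the decoding path added by the patch.
# Assumption: bytes_decode() detects the encoding with charset-normalizer.
import asyncio

from aiohttp import ClientResponse, ClientSession
from charset_normalizer import from_bytes


class NoCharsetHeaderError(Exception):
    pass


def fallback_charset_resolver(r: ClientResponse, b: bytes) -> str:  # noqa: ARG001
    # aiohttp calls this only when the Content-Type header declares no charset.
    # Raising defers the (comparatively expensive) detection to get_response_text()
    # instead of resolving a fallback charset for every such response.
    raise NoCharsetHeaderError


def bytes_decode(content: bytes) -> str:
    # Hypothetical fallback decoder using charset-normalizer.
    best = from_bytes(content).best()
    return str(best) if best is not None else content.decode("utf-8", "replace")


def get_response_text(*, response: ClientResponse, content: bytes) -> str:
    try:
        # Fast path: trust the charset declared in the response headers.
        return content.decode(response.get_encoding())
    except (NoCharsetHeaderError, UnicodeDecodeError):
        # Slow path: no charset header, or it was wrong -- detect the encoding.
        return bytes_decode(content)


async def main() -> None:
    async with ClientSession(
        fallback_charset_resolver=fallback_charset_resolver
    ) as session:
        async with session.get("https://httpbin.org/ip") as response:
            content = await response.read()
        print(get_response_text(response=response, content=content))


asyncio.run(main())

The practical effect is that responses with a declared charset (and JSON, which aiohttp decodes as UTF-8) are decoded directly, while charset detection runs only for the remaining responses rather than for every body, which is where the performance improvement in the commit title comes from.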