Improve response decoding performance
monosans committed Jan 23, 2024
1 parent 31a5096 commit 73303ca
Showing 5 changed files with 44 additions and 10 deletions.
1 change: 1 addition & 0 deletions proxy_scraper_checker/__main__.py
@@ -101,6 +101,7 @@ async def main() -> None:
        connector=TCPConnector(ssl=http.SSL_CONTEXT),
        headers=http.HEADERS,
        cookie_jar=http.get_cookie_jar(),
        fallback_charset_resolver=http.fallback_charset_resolver,
    ) as session:
        settings = await Settings.from_mapping(cfg, session=session)
        storage = ProxyStorage(protocols=settings.sources)
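For context (not part of this commit): a minimal, self-contained sketch of how a custom fallback_charset_resolver is wired into aiohttp's ClientSession. It assumes aiohttp 3.8.6 or newer, where the parameter exists; the resolver body and URL are illustrative only. aiohttp consults the resolver when it cannot determine a charset from the response headers, which is the hook the rest of this commit relies on.

import asyncio

from aiohttp import ClientResponse, ClientSession


def utf8_resolver(response: ClientResponse, body: bytes) -> str:
    # Called by aiohttp only when it cannot work out the charset from the
    # response headers; here we simply assume UTF-8.
    return "utf-8"


async def main() -> None:
    async with ClientSession(fallback_charset_resolver=utf8_resolver) as session:
        async with session.get("https://example.com") as response:  # illustrative URL
            body = await response.read()
            print(body.decode(response.get_encoding()))


asyncio.run(main())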
21 changes: 20 additions & 1 deletion proxy_scraper_checker/http.py
@@ -5,9 +5,17 @@
from types import MappingProxyType

import certifi
from aiohttp import DummyCookieJar, hdrs
from aiohttp import ClientResponse, DummyCookieJar, hdrs

from .utils import bytes_decode

SSL_CONTEXT = ssl.create_default_context(cafile=certifi.where())


class NoCharsetHeaderError(Exception):
    pass


HEADERS: MappingProxyType[str, str] = MappingProxyType({
    hdrs.USER_AGENT: (
        "Mozilla/5.0 (Windows NT 10.0; rv:121.0) Gecko/20100101 Firefox/121.0"
@@ -18,3 +26,14 @@
@lru_cache(None)
def get_cookie_jar() -> DummyCookieJar:
    return DummyCookieJar()


def get_response_text(*, response: ClientResponse, content: bytes) -> str:
    try:
        return content.decode(response.get_encoding())
    except (NoCharsetHeaderError, UnicodeDecodeError):
        return bytes_decode(content)


def fallback_charset_resolver(r: ClientResponse, b: bytes) -> str:  # noqa: ARG001
    raise NoCharsetHeaderError
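Not from the commit, but it illustrates the idea behind get_response_text: decode with the charset the server declared whenever possible, and run byte-level encoding detection only when no charset is declared (the custom resolver raises NoCharsetHeaderError instead of guessing) or the declared charset fails to decode the body. Below is a rough standalone sketch of that slow path, assuming bytes_decode is a detection helper built on something like charset-normalizer; the real helper lives in proxy_scraper_checker/utils.py and may differ.

from charset_normalizer import from_bytes


def bytes_decode_sketch(content: bytes) -> str:
    # Slow path: guess the encoding by inspecting the raw bytes.
    best = from_bytes(content).best()
    if best is None:
        # Last resort if detection finds nothing usable.
        return content.decode("utf-8", errors="replace")
    return str(best)

Because detection has to scan the payload, skipping it whenever the server already names an encoding is where the decoding speed-up in this commit comes from.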
18 changes: 14 additions & 4 deletions proxy_scraper_checker/proxy.py
@@ -9,10 +9,15 @@
from aiohttp import ClientSession
from aiohttp_socks import ProxyConnector, ProxyType

from .http import HEADERS, SSL_CONTEXT, get_cookie_jar
from .http import (
    HEADERS,
    SSL_CONTEXT,
    fallback_charset_resolver,
    get_cookie_jar,
    get_response_text,
)
from .parsers import parse_ipv4
from .settings import CheckWebsiteType, Settings
from .utils import bytes_decode


@attrs.define(
@@ -48,16 +53,21 @@ async def check(self, *, settings: Settings) -> None:
            headers=HEADERS,
            cookie_jar=get_cookie_jar(),
            timeout=settings.timeout,
            fallback_charset_resolver=fallback_charset_resolver,
        ) as session, session.get(
            settings.check_website, raise_for_status=True
        ) as response:
            content = await response.read()
            self.timeout = perf_counter() - start
            if settings.check_website_type == CheckWebsiteType.HTTPBIN_IP:
                r = json.loads(bytes_decode(content))
                r = json.loads(
                    get_response_text(response=response, content=content)
                )
                self.exit_ip = r["origin"]
            elif settings.check_website_type == CheckWebsiteType.PLAIN_IP:
                self.exit_ip = parse_ipv4(bytes_decode(content))
                self.exit_ip = parse_ipv4(
                    get_response_text(response=response, content=content)
                )

    def as_str(self, *, include_protocol: bool) -> str:
        with StringIO() as buf:
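Illustrative only, not from the commit: with CheckWebsiteType.HTTPBIN_IP the check website is expected to answer with httpbin-style JSON, so once the body is decoded the exit IP is a single key lookup. The payload below is hypothetical.

import json

body = b'{"origin": "203.0.113.7"}'  # hypothetical httpbin.org/ip-style payload
exit_ip = json.loads(body.decode("utf-8"))["origin"]
assert exit_ip == "203.0.113.7"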
4 changes: 3 additions & 1 deletion proxy_scraper_checker/scraper.py
@@ -11,6 +11,7 @@
from aiohttp_socks import ProxyType
from rich.progress import Progress, TaskID

from .http import get_response_text
from .parsers import PROXY_REGEX
from .proxy import Proxy
from .settings import Settings
@@ -34,11 +35,12 @@ async def scrape_one(
        if is_url(source):
            async with session.get(source, timeout=timeout) as response:
                content = await response.read()
                text = get_response_text(response=response, content=content)
        else:
            response = None
            async with aiofiles.open(source, "rb") as f:
                content = await f.read()
        text = bytes_decode(content)
                text = bytes_decode(content)
    except Exception as e:
        logger.warning(
            "%s | %s.%s: %s",
10 changes: 6 additions & 4 deletions proxy_scraper_checker/settings.py
@@ -25,10 +25,10 @@
from aiohttp_socks import ProxyType

from . import sort
from .http import get_response_text
from .null_context import NullContext
from .parsers import parse_ipv4
from .typing_compat import Any, Literal, Self
from .utils import bytes_decode

if TYPE_CHECKING:
    from .proxy import Proxy
@@ -129,9 +129,11 @@ async def _get_check_website_type_and_real_ip(
    Tuple[Literal[CheckWebsiteType.PLAIN_IP, CheckWebsiteType.HTTPBIN_IP], str],
]:
    try:
        async with session.get(check_website, raise_for_status=True) as r:
            content = await r.read()
            text = bytes_decode(content)
        async with session.get(
            check_website, raise_for_status=True
        ) as response:
            content = await response.read()
            text = get_response_text(response=response, content=content)
    except Exception:
        logger.exception(
            "Error when opening check_website without proxy, it will be "