Skip to content

Commit

Permalink
feature: Introduce default_encoding parameter to set/autodetect the…
Browse files Browse the repository at this point in the history
… encoding if the charset is missing from the headers (#284)

* Add a `default_encoding` parameter to [set|autodetect] the encoding if no charset is found in the headers

* Update github workflow actions/*
  • Loading branch information
deedy5 authored Apr 10, 2024
1 parent 418e452 commit 96d4c52
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 42 deletions.
27 changes: 13 additions & 14 deletions .github/workflows/build-and-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ jobs:
name: Lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- run: |
Expand All @@ -31,15 +31,15 @@ jobs:
name: Build sdist wheel
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- run: |
make preprocess
pipx run build --sdist
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v3 # https://github.com/actions/upload-artifact/issues/478
with:
path: ./dist/*.tar.gz

- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- run: |
Expand All @@ -52,51 +52,50 @@ jobs:
matrix:
os: [ubuntu-22.04, macos-12, macos-14, windows-2019]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'

- if: runner.os == 'Linux'
uses: docker/setup-qemu-action@v2
uses: docker/setup-qemu-action@v3
with:
platforms: all

# macOS make is too old
- if: runner.os == 'macOS'
run: |
brew install make automake libtool
which pipx || brew install pipx && pipx ensurepath
- name: Build and test wheels
uses: pypa/cibuildwheel@v2.16.5
uses: pypa/cibuildwheel@v2.17.0

# - name: Setup tmate session
# uses: mxschmitt/action-tmate@v3

- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v3 # https://github.com/actions/upload-artifact/issues/478
with:
path: ./wheelhouse/*.whl

upload_all:
needs: [bdist, sdist]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
- uses: actions/download-artifact@v3 # https://github.com/actions/upload-artifact/issues/478
if: startsWith(github.ref, 'refs/tags/')
with:
name: artifact
path: dist

- uses: pypa/gh-action-pypi-publish@v1.5.0
- uses: pypa/gh-action-pypi-publish@v1.8.14
if: startsWith(github.ref, 'refs/tags/')
with:
password: ${{ secrets.PYPI_TOKEN }}

- name: Upload release files
if: startsWith(github.ref, 'refs/tags/')
uses: softprops/action-gh-release@v1
uses: softprops/action-gh-release@v2
with:
files: |
./dist/*.whl
Expand Down
4 changes: 4 additions & 0 deletions curl_cffi/requests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def request(
impersonate: Optional[Union[str, BrowserType]] = None,
thread: Optional[ThreadType] = None,
default_headers: Optional[bool] = None,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
curl_options: Optional[dict] = None,
http_version: Optional[CurlHttpVersion] = None,
debug: bool = False,
Expand Down Expand Up @@ -90,6 +91,8 @@ def request(
impersonate: which browser version to impersonate.
thread: work with other thread implementations. choices: eventlet, gevent.
default_headers: whether to set default browser headers.
default_encoding: encoding for decoding response content if charset is not found in headers.
Defaults to "utf-8". Can be set to a callable for automatic detection.
curl_options: extra curl options to use.
http_version: limiting http version, http2 will be tried by default.
debug: print extra curl debug info.
Expand Down Expand Up @@ -122,6 +125,7 @@ def request(
content_callback=content_callback,
impersonate=impersonate,
default_headers=default_headers,
default_encoding=default_encoding,
http_version=http_version,
interface=interface,
multipart=multipart,
Expand Down
67 changes: 59 additions & 8 deletions curl_cffi/requests/models.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import queue
import re
import warnings
from concurrent.futures import Future
from functools import cached_property
from json import loads
from typing import Any, Awaitable, Dict, List, Optional
from typing import Any, Awaitable, Callable, Dict, List, Optional, Union

from .. import Curl
from .cookies import Cookies
from .errors import RequestsError
from .headers import Headers

# Pulls the charset value out of a Content-Type header value.
# HTTP parameter names are case-insensitive and the value may be sent as a
# quoted-string (``charset="utf-8"``), so match regardless of case and skip an
# optional opening quote. Group 1 is still the bare charset name.
CHARSET_RE = re.compile(r"charset=[\"']?([\w-]+)", re.IGNORECASE)


def clear_queue(q: queue.Queue):
with q.mutex:
Expand Down Expand Up @@ -41,6 +45,8 @@ class Response:
elapsed: how many seconds the request cost.
encoding: http body encoding.
charset: alias for encoding.
charset_encoding: encoding specified by the Content-Type header.
default_encoding: user-defined encoding used for decoding content if charset is not found in headers.
redirect_count: how many redirects happened.
redirect_url: the final redirected url.
http_version: http version used.
Expand All @@ -58,8 +64,7 @@ def __init__(self, curl: Optional[Curl] = None, request: Optional[Request] = Non
self.headers = Headers()
self.cookies = Cookies()
self.elapsed = 0.0
self.encoding = "utf-8"
self.charset = self.encoding
self.default_encoding: Union[str, Callable[[bytes], str]] = "utf-8"
self.redirect_count = 0
self.redirect_url = ""
self.http_version = 0
Expand All @@ -70,16 +75,62 @@ def __init__(self, curl: Optional[Curl] = None, request: Optional[Request] = Non
self.astream_task: Optional[Awaitable] = None
self.quit_now = None

@property
def charset(self) -> str:
    """Read-only alias: returns whatever ``self.encoding`` resolves to."""
    resolved = self.encoding
    return resolved

@property
def encoding(self) -> str:
"""
Determines the encoding to decode byte content into text.
The method follows a specific priority to decide the encoding:
1. If `.encoding` has been explicitly set, it is used.
2. The encoding specified by the `charset` parameter in the `Content-Type` header.
3. The encoding specified by the `default_encoding` attribute. This can either be
a string (e.g., "utf-8") or a callable for charset autodetection.
"""
if not hasattr(self, "_encoding"):
encoding = self.charset_encoding
if encoding is None:
if isinstance(self.default_encoding, str):
encoding = self.default_encoding
elif callable(self.default_encoding):
encoding = self.default_encoding(self.content)
self._encoding = encoding or "utf-8"
return self._encoding

@encoding.setter
def encoding(self, value: str) -> None:
if hasattr(self, "_text"):
raise ValueError("Cannot set encoding after text has been accessed")
self._encoding = value

@property
def charset_encoding(self) -> Optional[str]:
    """Charset named in the ``Content-Type`` header, or None if absent.

    None is returned both when the header is missing/empty and when
    ``CHARSET_RE`` finds no charset parameter in it.
    """
    header_value = self.headers.get("Content-Type")
    if not header_value:
        return None
    found = CHARSET_RE.search(header_value)
    if found is None:
        return None
    return found.group(1)

@property
def text(self) -> str:
    """Body decoded to ``str``; computed on first access and cached in ``_text``.

    An empty/falsy ``content`` yields "" without calling ``_decode``.
    """
    if hasattr(self, "_text"):
        return self._text
    self._text = self._decode(self.content) if self.content else ""
    return self._text

def _decode(self, content: bytes) -> str:
    """Decode ``content`` to text using ``self.encoding``.

    Undecodable bytes are replaced (``errors="replace"``) rather than raised.
    If the first attempt fails, most notably with ``LookupError`` for an
    unknown codec name, the bytes are decoded as "utf-8-sig" instead, again
    with replacement so the fallback itself cannot raise on invalid bytes.

    Args:
        content: raw body bytes.

    Returns:
        The decoded text.
    """
    try:
        return content.decode(self.encoding, errors="replace")
    except (UnicodeDecodeError, LookupError):
        # Previously this fallback was strict, so an unknown codec name
        # combined with non-UTF-8 bytes surfaced as UnicodeDecodeError.
        return content.decode("utf-8-sig", errors="replace")

@property
def text(self) -> str:
return self._decode(self.content)

def raise_for_status(self):
"""Raise an error if status code is not in [200, 400)"""
if not self.ok:
Expand Down
36 changes: 18 additions & 18 deletions curl_cffi/requests/session.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import asyncio
import math
import queue
import re
import threading
import warnings
from concurrent.futures import ThreadPoolExecutor
Expand Down Expand Up @@ -55,7 +54,6 @@ class ProxySpec(TypedDict, total=False):
else:
ProxySpec = Dict[str, str]

CHARSET_RE = re.compile(r"charset=([\w-]+)")
ThreadType = Literal["eventlet", "gevent"]


Expand Down Expand Up @@ -205,6 +203,7 @@ def __init__(
max_redirects: int = -1,
impersonate: Optional[Union[str, BrowserType]] = None,
default_headers: bool = True,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
curl_options: Optional[dict] = None,
curl_infos: Optional[list] = None,
http_version: Optional[CurlHttpVersion] = None,
Expand All @@ -224,6 +223,7 @@ def __init__(
self.max_redirects = max_redirects
self.impersonate = impersonate
self.default_headers = default_headers
self.default_encoding = default_encoding
self.curl_options = curl_options or {}
self.curl_infos = curl_infos or []
self.http_version = http_version
Expand Down Expand Up @@ -547,7 +547,7 @@ def qput(chunk):

return req, buffer, header_buffer, q, header_recved, quit_now

def _parse_response(self, curl, buffer, header_buffer):
def _parse_response(self, curl, buffer, header_buffer, default_encoding):
c = curl
rsp = Response(c)
rsp.url = cast(bytes, c.getinfo(CurlInfo.EFFECTIVE_URL)).decode()
Expand Down Expand Up @@ -583,13 +583,7 @@ def _parse_response(self, curl, buffer, header_buffer):
rsp.cookies = self.cookies
# print("Cookies after extraction", self.cookies)

content_type = rsp.headers.get("Content-Type", default="")
charset_match = CHARSET_RE.search(content_type)
charset = charset_match.group(1) if charset_match else "utf-8"

rsp.charset = charset
rsp.encoding = charset # TODO use chardet

rsp.default_encoding = default_encoding
rsp.elapsed = cast(float, c.getinfo(CurlInfo.TOTAL_TIME))
rsp.redirect_count = cast(int, c.getinfo(CurlInfo.REDIRECT_COUNT))
rsp.redirect_url = cast(bytes, c.getinfo(CurlInfo.REDIRECT_URL)).decode()
Expand Down Expand Up @@ -639,6 +633,8 @@ def __init__(
max_redirects: max redirect counts, default unlimited(-1).
impersonate: which browser version to impersonate in the session.
interface: which interface to use for requests to the server.
default_encoding: encoding for decoding response content if charset is not found in headers.
Defaults to "utf-8". Can be set to a callable for automatic detection.
Notes:
This class can be used as a context manager.
Expand Down Expand Up @@ -767,6 +763,7 @@ def request(
content_callback: Optional[Callable] = None,
impersonate: Optional[Union[str, BrowserType]] = None,
default_headers: Optional[bool] = None,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
http_version: Optional[CurlHttpVersion] = None,
interface: Optional[str] = None,
cert: Optional[Union[str, Tuple[str, str]]] = None,
Expand Down Expand Up @@ -825,7 +822,7 @@ def perform():
try:
c.perform()
except CurlError as e:
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
rsp.request = req
cast(queue.Queue, q).put_nowait(RequestsError(str(e), e.code, rsp))
finally:
Expand All @@ -843,7 +840,7 @@ def cleanup(fut):

# Wait for the first chunk
cast(threading.Event, header_recved).wait()
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
header_parsed.set()

# Raise the exception if something wrong happens when receiving the header.
Expand All @@ -868,11 +865,11 @@ def cleanup(fut):
else:
c.perform()
except CurlError as e:
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
rsp.request = req
raise RequestsError(str(e), e.code, rsp) from e
else:
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
rsp.request = req
return rsp
finally:
Expand Down Expand Up @@ -919,6 +916,8 @@ def __init__(
allow_redirects: whether to allow redirection.
max_redirects: max redirect counts, default unlimited(-1).
impersonate: which browser version to impersonate in the session.
default_encoding: encoding for decoding response content if charset is not found in headers.
Defaults to "utf-8". Can be set to a callable for automatic detection.
Notes:
This class can be used as a context manager, and it's recommended to use via
Expand Down Expand Up @@ -1043,6 +1042,7 @@ async def request(
content_callback: Optional[Callable] = None,
impersonate: Optional[Union[str, BrowserType]] = None,
default_headers: Optional[bool] = None,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
http_version: Optional[CurlHttpVersion] = None,
interface: Optional[str] = None,
cert: Optional[Union[str, Tuple[str, str]]] = None,
Expand Down Expand Up @@ -1093,7 +1093,7 @@ async def perform():
try:
await task
except CurlError as e:
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
rsp.request = req
cast(asyncio.Queue, q).put_nowait(RequestsError(str(e), e.code, rsp))
finally:
Expand All @@ -1113,7 +1113,7 @@ def cleanup(fut):
# Unlike threads, coroutines do not use preemptive scheduling.
# For asyncio, there is no need for a header_parsed event, the
# _parse_response will execute in the foreground, no background tasks running.
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)

first_element = _peek_aio_queue(cast(asyncio.Queue, q))
if isinstance(first_element, RequestsError):
Expand All @@ -1132,11 +1132,11 @@ def cleanup(fut):
await task
# print(curl.getinfo(CurlInfo.CAINFO))
except CurlError as e:
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
rsp.request = req
raise RequestsError(str(e), e.code, rsp) from e
else:
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
rsp.request = req
return rsp
finally:
Expand Down
Loading

0 comments on commit 96d4c52

Please sign in to comment.