From ea7dceb4d70cf5bedc00665f036fec3e6d1a2421 Mon Sep 17 00:00:00 2001 From: Jonas Dedden Date: Tue, 13 Aug 2024 15:49:06 +0200 Subject: [PATCH] Include py.typed file & fix and/or add missing/wrong type hints in stub files (#50) * Include py.typed file to make fastwarc PEP 561 compliant * Add explicit exporting of symbols in __init__.py * Add/correct missing/wrong type hints in stub files * Widen stream typehints by accepting a _GenericIOStream Protocol * Expose all possible Enum values of WarcRecordType at top-level --- fastwarc/fastwarc/__init__.py | 13 +++++++++ fastwarc/fastwarc/py.typed | 0 fastwarc/fastwarc/stream_io.pyi | 31 ++++++++++++++++----- fastwarc/fastwarc/tools.pyi | 18 ++++++------- fastwarc/fastwarc/warc.pyi | 48 +++++++++++++++++++++++++-------- 5 files changed, 83 insertions(+), 27 deletions(-) create mode 100644 fastwarc/fastwarc/py.typed diff --git a/fastwarc/fastwarc/__init__.py b/fastwarc/fastwarc/__init__.py index d5206b94..8016b8b1 100644 --- a/fastwarc/fastwarc/__init__.py +++ b/fastwarc/fastwarc/__init__.py @@ -15,3 +15,16 @@ from .stream_io import FileStream, GZipStream, LZ4Stream from .stream_io import FastWARCError, StreamError from .warc import ArchiveIterator, WarcRecord, WarcRecordType + +# Exposing symbols for legacy compatibility, please prefer explicit imports from submodules + +__all__ = [ + "FileStream", + "GZipStream", + "LZ4Stream", + "FastWARCError", + "StreamError", + "ArchiveIterator", + "WarcRecord", + "WarcRecordType" +] diff --git a/fastwarc/fastwarc/py.typed b/fastwarc/fastwarc/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/fastwarc/fastwarc/stream_io.pyi b/fastwarc/fastwarc/stream_io.pyi index 4b39f212..5559aff1 100644 --- a/fastwarc/fastwarc/stream_io.pyi +++ b/fastwarc/fastwarc/stream_io.pyi @@ -1,18 +1,34 @@ -from typing import ContextManager, IO +from types import TracebackType +from typing import ContextManager, Optional, Type, Union, BinaryIO, Protocol + +class _GenericIOStream(Protocol): + def write(self, data: bytes) -> int: ... + def flush(self) -> None: ... + def read(self, size: int) -> bytes: ... + def seek(self, offset: int) -> int: ... + def close(self) -> None: ... + def tell(self) -> int: ... -class IOStream(ContextManager): +class IOStream(ContextManager[IOStream]): def read(self, size: int) -> bytes: ... def write(self, data: bytes) -> int: ... def close(self) -> None: ... def flush(self) -> None: ... def seek(self, offset: int) -> None: ... def tell(self) -> int: ... + def __enter__(self) -> IOStream: ... + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + traceback: Optional[TracebackType] + ) -> None: ... class BufferedReader: def __init__( - self, stream: IOStream, buf_size: int = 8192, negotiate_stream: bool = True + self, stream: Union[IOStream, BinaryIO, _GenericIOStream], buf_size: int = 65536, negotiate_stream: bool = True ) -> None: ... def close(self) -> None: ... def consume(self, size: int = -1) -> int: ... @@ -22,6 +38,7 @@ class BufferedReader: class BytesIOStream(IOStream): + def __init__(self, initial_data: Union[bytes, None] = None) -> None: ... def getvalue(self) -> bytes: ... @@ -36,20 +53,20 @@ class CompressingStream(IOStream): class BrotliStream(CompressingStream): def __init__( - self, raw_stream: IOStream, quality: int = 11, lgwin: int = 22, lgblock: int = 0 + self, raw_stream: Union[IOStream, BinaryIO, _GenericIOStream], quality: int = 11, lgwin: int = 22, lgblock: int = 0 ) -> None: ... class GZipStream(CompressingStream): def __init__( - self, raw_stream: IOStream, compression_level: int = 9, zlib: bool = False + self, raw_stream: Union[IOStream, BinaryIO, _GenericIOStream], compression_level: int = 9, zlib: bool = False ) -> None: ... class LZ4Stream(CompressingStream): def __init__( self, - raw_stream: IOStream, + raw_stream: Union[IOStream, BinaryIO, _GenericIOStream], compression_level: int = 12, favor_dec_speed: bool = True, ) -> None: ... @@ -57,7 +74,7 @@ class LZ4Stream(CompressingStream): class PythonIOStreamAdapter(IOStream): - def __init__(self, py_stream: IO) -> None: ... + def __init__(self, py_stream: _GenericIOStream) -> None: ... class FastWARCError(Exception): diff --git a/fastwarc/fastwarc/tools.pyi b/fastwarc/fastwarc/tools.pyi index b51fe5d8..367a5e23 100644 --- a/fastwarc/fastwarc/tools.pyi +++ b/fastwarc/fastwarc/tools.pyi @@ -1,7 +1,7 @@ from enum import IntFlag -from typing import Union, Type, Iterator, Tuple +from typing import Union, Iterator, Tuple, Protocol -from .stream_io import IOStream +from .stream_io import IOStream, _GenericIOStream from .warc import WarcRecord @@ -16,16 +16,16 @@ def detect_compression_algorithm(file: str) -> CompressionAlg: ... def wrap_warc_stream( - file: Union[str, Type[IOStream]], + file: Union[str, IOStream, _GenericIOStream], mode: str, comp_alg: CompressionAlg = CompressionAlg.auto, **comp_args -) -> Type[IOStream]: ... +) -> IOStream: ... def recompress_warc_interactive( - warc_in: Union[str, Type[IOStream]], - warc_out: Union[str, Type[IOStream]], + warc_in: Union[str, IOStream, _GenericIOStream], + warc_out: Union[str, IOStream, _GenericIOStream], comp_alg_in: CompressionAlg = CompressionAlg.auto, comp_alg_out: CompressionAlg = CompressionAlg.auto, **comp_args @@ -33,8 +33,8 @@ def recompress_warc_interactive( def recompress_warc( - warc_in: Union[str, Type[IOStream]], - warc_out: Union[str, Type[IOStream]], + warc_in: Union[str, IOStream, _GenericIOStream], + warc_out: Union[str, IOStream, _GenericIOStream], comp_alg_in: CompressionAlg = CompressionAlg.auto, comp_alg_out: CompressionAlg = CompressionAlg.auto, **comp_args @@ -42,7 +42,7 @@ def recompress_warc( def verify_digests( - warc_in: Union[str, Type[IOStream]], + warc_in: Union[str, IOStream], verify_payloads: bool = False, comp_alg: CompressionAlg = CompressionAlg.auto, ) -> bool: ... diff --git a/fastwarc/fastwarc/warc.pyi b/fastwarc/fastwarc/warc.pyi index e7a860fe..80576bdf 100644 --- a/fastwarc/fastwarc/warc.pyi +++ b/fastwarc/fastwarc/warc.pyi @@ -1,18 +1,20 @@ from datetime import datetime from typing import ( + Union, Optional, Iterator, Dict, Tuple, - MutableMapping, + Literal, + Callable, Iterable, ValuesView, KeysView, - Type, + BinaryIO, ) from enum import IntFlag -from .stream_io import BufferedReader, IOStream +from .stream_io import BufferedReader, IOStream, _GenericIOStream class WarcRecordType(IntFlag): @@ -29,24 +31,38 @@ class WarcRecordType(IntFlag): no_type = 0 +warcinfo = WarcRecordType.warcinfo +response = WarcRecordType.response +resource = WarcRecordType.resource +request = WarcRecordType.request +metadata = WarcRecordType.metadata +revisit = WarcRecordType.revisit +conversion = WarcRecordType.conversion +continuation = WarcRecordType.continuation +unknown = WarcRecordType.unknown no_type = WarcRecordType.no_type any_type = WarcRecordType.any_type -class WarcHeaderMap(MutableMapping[str, str]): +class WarcHeaderMap: reason_phrase: Optional[str] status_code: Optional[str] status_line: str def append(self, key: str, value: str) -> None: ... def asdict(self) -> Dict[str, str]: ... - def astuples(self) -> Tuple[str, str]: ... + def astuples(self) -> Tuple[Tuple[str, str], ...]: ... def clear(self) -> None: ... def get(self, key: str, default: Optional[str] = None) -> Optional[str]: ... def items(self) -> Iterator[Tuple[str, str]]: ... def keys(self) -> KeysView[str]: ... def values(self) -> ValuesView[str]: ... def write(self, stream: IOStream) -> None: ... + def __getitem__(self, item: str) -> str: ... + def __iter__(self) -> Iterator[Tuple[str, str]]: ... + def __len__(self) -> int: ... + def __setitem__(self, key: str, value: str) -> None: ... + def __contains__(self, item: str) -> bool: ... class WarcRecord: @@ -59,32 +75,42 @@ class WarcRecord: is_http_parsed: bool http_headers: Optional[WarcHeaderMap] http_content_type: Optional[str] - http_content_type: Optional[str] http_charset: Optional[str] http_date: Optional[datetime] http_last_modified: Optional[datetime] - content_length: int reader: BufferedReader stream_pos: int def init_headers( - self, content_length: int = 0, record_type=no_type, record_urn=None - ): ... + self, content_length: int = 0, record_type: WarcRecordType = no_type, record_urn: Optional[bytes] = None + ) -> None: ... def freeze(self) -> bool: ... def set_bytes_content(self, content: bytes) -> None: ... - def parse_http(self, strict_mode=True, auto_decode: str = "none") -> None: ... + def parse_http(self, strict_mode: bool = True, auto_decode: str = "none") -> None: ... def verify_block_digest(self, consume: bool = False) -> bool: ... def verify_payload_digest(self, consume: bool = False) -> bool: ... + def write( + self, + stream: Union[IOStream, BinaryIO, _GenericIOStream], + checksum_data: bool = False, + payload_digest: Optional[bytes] = None, + chunk_size: int = 16384 + ) -> int: ... + class ArchiveIterator(Iterable[WarcRecord]): def __init__( self, - stream: Type[IOStream], + stream: Union[IOStream, BinaryIO, _GenericIOStream], record_types: WarcRecordType = any_type, parse_http: bool = True, min_content_length: int = -1, max_content_length: int = -1, + func_filter: Optional[Callable[[WarcRecord], bool]] = None, + verify_digests: bool = False, + strict_mode: bool = True, + auto_decode: Literal["none", "content", "transfer", "all"] = "none", ) -> None: ... def __iter__(self) -> Iterator[WarcRecord]: ... def __next__(self) -> WarcRecord: ...