Skip to content

Commit

Permalink
Include py.typed file & fix and/or add missing/wrong type hints in st…
Browse files Browse the repository at this point in the history
…ub files (#50)

* Include py.typed file to make fastwarc PEP 561 compliant
* Add explicit exporting of symbols in __init__.py
* Add/correct missing/wrong type hints in stub files
* Widen stream type hints by accepting a _GenericIOStream Protocol
* Expose all possible Enum values of WarcRecordType at top-level
  • Loading branch information
jonded94 committed Aug 13, 2024
1 parent 066e5a1 commit ea7dceb
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 27 deletions.
13 changes: 13 additions & 0 deletions fastwarc/fastwarc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,16 @@
from .stream_io import FileStream, GZipStream, LZ4Stream
from .stream_io import FastWARCError, StreamError
from .warc import ArchiveIterator, WarcRecord, WarcRecordType

# Expose symbols at package level for legacy compatibility; please prefer explicit imports from submodules.

# Public names of the package, kept in the same order as the imports above.
__all__ = [
    # from .stream_io: stream classes
    "FileStream", "GZipStream", "LZ4Stream",
    # from .stream_io: exception types
    "FastWARCError", "StreamError",
    # from .warc
    "ArchiveIterator", "WarcRecord", "WarcRecordType",
]
Empty file added fastwarc/fastwarc/py.typed
Empty file.
31 changes: 24 additions & 7 deletions fastwarc/fastwarc/stream_io.pyi
Original file line number Diff line number Diff line change
@@ -1,18 +1,34 @@
from typing import ContextManager, IO
from types import TracebackType
from typing import ContextManager, Optional, Type, Union, BinaryIO, Protocol

class _GenericIOStream(Protocol):
    """Structural type for stream-like objects.

    Any object that provides the six methods below with compatible
    signatures satisfies this protocol; it does not need to inherit from
    ``IOStream`` or ``BinaryIO``.
    """

    def write(self, data: bytes) -> int: ...
    def flush(self) -> None: ...
    def read(self, size: int) -> bytes: ...
    # Unlike ``IOStream.seek`` in this stub (which returns None), the
    # protocol's ``seek`` is declared to return an int.
    def seek(self, offset: int) -> int: ...
    def close(self) -> None: ...
    def tell(self) -> int: ...


class IOStream(ContextManager["IOStream"]):
    """Base class of the stream types declared in this stub.

    It is a context manager whose ``__enter__`` returns the stream itself.
    The self-reference is written as a string so the definition does not
    depend on ``IOStream`` already being bound when the class statement runs.
    """

    def read(self, size: int) -> bytes: ...
    def write(self, data: bytes) -> int: ...
    def close(self) -> None: ...
    def flush(self) -> None: ...
    def seek(self, offset: int) -> None: ...
    def tell(self) -> int: ...
    def __enter__(self) -> "IOStream": ...
    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None: ...


class BufferedReader:
def __init__(
self, stream: IOStream, buf_size: int = 8192, negotiate_stream: bool = True
self, stream: Union[IOStream, BinaryIO, _GenericIOStream], buf_size: int = 65536, negotiate_stream: bool = True
) -> None: ...
def close(self) -> None: ...
def consume(self, size: int = -1) -> int: ...
Expand All @@ -22,6 +38,7 @@ class BufferedReader:


class BytesIOStream(IOStream):
    """``IOStream`` subclass that can be seeded with bytes and read back via ``getvalue``."""

    # ``initial_data`` is optional; omitting it is the same as passing None.
    def __init__(self, initial_data: Union[bytes, None] = None) -> None: ...
    def getvalue(self) -> bytes: ...


Expand All @@ -36,28 +53,28 @@ class CompressingStream(IOStream):

class BrotliStream(CompressingStream):
    """``CompressingStream`` subclass taking Brotli-style tuning parameters."""

    # ``raw_stream`` may be an IOStream, a standard BinaryIO, or any object
    # that structurally matches _GenericIOStream.
    def __init__(
        self,
        raw_stream: Union[IOStream, BinaryIO, _GenericIOStream],
        quality: int = 11,
        lgwin: int = 22,
        lgblock: int = 0,
    ) -> None: ...


class GZipStream(CompressingStream):
    """``CompressingStream`` subclass configured by a compression level and a ``zlib`` flag."""

    # ``raw_stream`` may be an IOStream, a standard BinaryIO, or any object
    # that structurally matches _GenericIOStream.
    def __init__(
        self,
        raw_stream: Union[IOStream, BinaryIO, _GenericIOStream],
        compression_level: int = 9,
        zlib: bool = False,
    ) -> None: ...


class LZ4Stream(CompressingStream):
    """``CompressingStream`` subclass configured by a compression level and a decode-speed flag."""

    # ``raw_stream`` may be an IOStream, a standard BinaryIO, or any object
    # that structurally matches _GenericIOStream.
    def __init__(
        self,
        raw_stream: Union[IOStream, BinaryIO, _GenericIOStream],
        compression_level: int = 12,
        favor_dec_speed: bool = True,
    ) -> None: ...
    def prepopulate(self, initial_data: bytes) -> None: ...


class PythonIOStreamAdapter(IOStream):
    """``IOStream`` subclass constructed from a Python stream object."""

    # ``py_stream`` only has to satisfy the _GenericIOStream protocol; it is
    # no longer required to be a ``typing.IO`` instance.
    def __init__(self, py_stream: _GenericIOStream) -> None: ...


class FastWARCError(Exception):
Expand Down
18 changes: 9 additions & 9 deletions fastwarc/fastwarc/tools.pyi
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from enum import IntFlag
from typing import Union, Type, Iterator, Tuple
from typing import Union, Iterator, Tuple, Protocol

from .stream_io import IOStream
from .stream_io import IOStream, _GenericIOStream
from .warc import WarcRecord


Expand All @@ -16,33 +16,33 @@ def detect_compression_algorithm(file: str) -> CompressionAlg: ...


# ``file`` is either a path string or a stream *instance* (an IOStream or any
# object matching _GenericIOStream), and the result is an IOStream instance.
# ``Type[IOStream]`` would describe the class object itself, not an instance.
def wrap_warc_stream(
    file: Union[str, IOStream, _GenericIOStream],
    mode: str,
    comp_alg: CompressionAlg = CompressionAlg.auto,
    **comp_args
) -> IOStream: ...


# Input and output may each be a path string or a stream instance (an
# IOStream or any object matching _GenericIOStream). The result iterates
# over ``(WarcRecord, int)`` pairs.
def recompress_warc_interactive(
    warc_in: Union[str, IOStream, _GenericIOStream],
    warc_out: Union[str, IOStream, _GenericIOStream],
    comp_alg_in: CompressionAlg = CompressionAlg.auto,
    comp_alg_out: CompressionAlg = CompressionAlg.auto,
    **comp_args
) -> Iterator[Tuple[WarcRecord, int]]: ...


# Input and output may each be a path string or a stream instance (an
# IOStream or any object matching _GenericIOStream).
# NOTE(review): the signature and return type are identical to
# ``recompress_warc_interactive``. Confirm that the non-interactive variant
# really returns an iterator of ``(WarcRecord, int)`` pairs.
def recompress_warc(
    warc_in: Union[str, IOStream, _GenericIOStream],
    warc_out: Union[str, IOStream, _GenericIOStream],
    comp_alg_in: CompressionAlg = CompressionAlg.auto,
    comp_alg_out: CompressionAlg = CompressionAlg.auto,
    **comp_args
) -> Iterator[Tuple[WarcRecord, int]]: ...


# ``warc_in`` is a path string or an IOStream *instance*.
# NOTE(review): unlike ``wrap_warc_stream`` and the ``recompress_warc*``
# functions in this stub, ``_GenericIOStream`` is not accepted here. Confirm
# whether that omission is intentional.
def verify_digests(
    warc_in: Union[str, IOStream],
    verify_payloads: bool = False,
    comp_alg: CompressionAlg = CompressionAlg.auto,
) -> bool: ...
48 changes: 37 additions & 11 deletions fastwarc/fastwarc/warc.pyi
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
from datetime import datetime
from typing import (
Union,
Optional,
Iterator,
Dict,
Tuple,
MutableMapping,
Literal,
Callable,
Iterable,
ValuesView,
KeysView,
Type,
BinaryIO,
)
from enum import IntFlag

from .stream_io import BufferedReader, IOStream
from .stream_io import BufferedReader, IOStream, _GenericIOStream


class WarcRecordType(IntFlag):
Expand All @@ -29,24 +31,38 @@ class WarcRecordType(IntFlag):
no_type = 0


# Module-level aliases for WarcRecordType members, so each value can be
# referenced without the enum prefix. Signatures further down in this stub use
# ``no_type`` and ``any_type`` as parameter defaults.
warcinfo = WarcRecordType.warcinfo
response = WarcRecordType.response
resource = WarcRecordType.resource
request = WarcRecordType.request
metadata = WarcRecordType.metadata
revisit = WarcRecordType.revisit
conversion = WarcRecordType.conversion
continuation = WarcRecordType.continuation
unknown = WarcRecordType.unknown
no_type = WarcRecordType.no_type
any_type = WarcRecordType.any_type


class WarcHeaderMap:
    """Header container with string keys and string values.

    It offers dict-style access (``[]``, ``in``, ``len``, ``get``, ``keys``,
    ``values``, ``items``) plus ``append``, ``asdict``, ``astuples`` and
    ``write``. It is declared as a plain class and not as
    ``MutableMapping[str, str]``.
    """

    reason_phrase: Optional[str]
    status_code: Optional[str]
    status_line: str

    def append(self, key: str, value: str) -> None: ...
    def asdict(self) -> Dict[str, str]: ...
    # A variable-length tuple of (key, value) pairs, not a single pair.
    def astuples(self) -> Tuple[Tuple[str, str], ...]: ...
    def clear(self) -> None: ...
    def get(self, key: str, default: Optional[str] = None) -> Optional[str]: ...
    def items(self) -> Iterator[Tuple[str, str]]: ...
    def keys(self) -> KeysView[str]: ...
    def values(self) -> ValuesView[str]: ...
    def write(self, stream: IOStream) -> None: ...
    def __getitem__(self, item: str) -> str: ...
    # Iteration yields (key, value) pairs, not bare keys.
    def __iter__(self) -> Iterator[Tuple[str, str]]: ...
    def __len__(self) -> int: ...
    def __setitem__(self, key: str, value: str) -> None: ...
    def __contains__(self, item: str) -> bool: ...


class WarcRecord:
Expand All @@ -59,32 +75,42 @@ class WarcRecord:
is_http_parsed: bool
http_headers: Optional[WarcHeaderMap]
http_content_type: Optional[str]
http_content_type: Optional[str]
http_charset: Optional[str]
http_date: Optional[datetime]
http_last_modified: Optional[datetime]
content_length: int
reader: BufferedReader
stream_pos: int

def init_headers(
self, content_length: int = 0, record_type=no_type, record_urn=None
): ...
self, content_length: int = 0, record_type: WarcRecordType = no_type, record_urn: Optional[bytes] = None
) -> None: ...
def freeze(self) -> bool: ...
def set_bytes_content(self, content: bytes) -> None: ...
def parse_http(self, strict_mode=True, auto_decode: str = "none") -> None: ...
def parse_http(self, strict_mode: bool = True, auto_decode: str = "none") -> None: ...
def verify_block_digest(self, consume: bool = False) -> bool: ...
def verify_payload_digest(self, consume: bool = False) -> bool: ...
def write(
self,
stream: Union[IOStream, BinaryIO, _GenericIOStream],
checksum_data: bool = False,
payload_digest: Optional[bytes] = None,
chunk_size: int = 16384
) -> int: ...



class ArchiveIterator(Iterable[WarcRecord]):
    """Iterable of ``WarcRecord`` objects read from ``stream``.

    ``stream`` is a stream *instance*: an IOStream, a standard BinaryIO, or
    any object matching _GenericIOStream. ``auto_decode`` is restricted to
    the four literal values listed in its annotation.
    """

    def __init__(
        self,
        stream: Union[IOStream, BinaryIO, _GenericIOStream],
        record_types: WarcRecordType = any_type,
        parse_http: bool = True,
        min_content_length: int = -1,
        max_content_length: int = -1,
        func_filter: Optional[Callable[[WarcRecord], bool]] = None,
        verify_digests: bool = False,
        strict_mode: bool = True,
        auto_decode: Literal["none", "content", "transfer", "all"] = "none",
    ) -> None: ...
    def __iter__(self) -> Iterator[WarcRecord]: ...
    def __next__(self) -> WarcRecord: ...

0 comments on commit ea7dceb

Please sign in to comment.