Skip to content

Commit

Permalink
Stub files for fastwarc
Browse files Browse the repository at this point in the history
  • Loading branch information
habibutsu authored and phoerious committed Aug 8, 2024
1 parent 0826481 commit d19eeb4
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 0 deletions.
72 changes: 72 additions & 0 deletions fastwarc/fastwarc/stream_io.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from typing import ContextManager, IO


class IOStream(ContextManager):
    """Stub for the stream base type; the other stream stubs here derive from it."""

    # Data transfer.
    def read(self, size: int) -> bytes:
        """Stub signature: takes an ``int`` size, annotated to return ``bytes``."""
        ...

    def write(self, data: bytes) -> int:
        """Stub signature: takes ``bytes``, annotated to return ``int``."""
        ...

    # Positioning.
    def seek(self, offset: int) -> None:
        """Stub signature: takes an ``int`` offset, annotated to return ``None``."""
        ...

    def tell(self) -> int:
        """Stub signature: no arguments, annotated to return ``int``."""
        ...

    # Lifecycle.
    def flush(self) -> None:
        """Stub signature: no arguments, annotated to return ``None``."""
        ...

    def close(self) -> None:
        """Stub signature: no arguments, annotated to return ``None``."""
        ...


class BufferedReader:
    """Stub for a reader that is constructed over an ``IOStream``."""

    def __init__(
        self,
        stream: IOStream,
        buf_size: int = 8192,
        negotiate_stream: bool = True,
    ) -> None:
        """Stub signature: wraps ``stream``; ``buf_size`` defaults to 8192."""
        ...

    # Reading; ``size`` defaults to -1 on both ``read`` and ``consume``.
    def read(self, size: int = -1) -> bytes: ...

    def readline(
        self,
        crlf: bool = True,
        max_line_len: int = 8192,
    ) -> bytes: ...

    def consume(self, size: int = -1) -> int: ...

    # Position and lifecycle.
    def tell(self) -> int: ...

    def close(self) -> None: ...


class BytesIOStream(IOStream):
    """``IOStream`` stub that additionally declares ``getvalue``."""

    def getvalue(self) -> bytes:
        """Stub signature: no arguments, annotated to return ``bytes``."""
        ...


class FileStream(IOStream):
    """``IOStream`` stub constructed from a file name and a mode string."""

    def __init__(
        self,
        filename: str,
        mode: str = "rb",
    ) -> None:
        """Stub signature: ``mode`` defaults to ``"rb"``."""
        ...


class CompressingStream(IOStream):
    """``IOStream`` stub that is the base of the Brotli, GZip and LZ4 stream stubs."""

    def begin_member(self) -> int:
        """Stub signature: no arguments, annotated to return ``int``."""
        ...

    def end_member(self) -> int:
        """Stub signature: no arguments, annotated to return ``int``."""
        ...


class BrotliStream(CompressingStream):
    """``CompressingStream`` stub constructed over another ``IOStream``."""

    def __init__(
        self,
        raw_stream: IOStream,
        quality: int = 11,
        lgwin: int = 22,
        lgblock: int = 0,
    ) -> None:
        """Stub signature: defaults are ``quality=11``, ``lgwin=22``, ``lgblock=0``."""
        ...


class GZipStream(CompressingStream):
    """``CompressingStream`` stub constructed over another ``IOStream``."""

    def __init__(
        self,
        raw_stream: IOStream,
        compression_level: int = 9,
        zlib: bool = False,
    ) -> None:
        """Stub signature: defaults are ``compression_level=9``, ``zlib=False``."""
        ...


class LZ4Stream(CompressingStream):
    """``CompressingStream`` stub constructed over another ``IOStream``."""

    def __init__(
        self,
        raw_stream: IOStream,
        compression_level: int = 12,
        favor_dec_speed: bool = True,
    ) -> None:
        """Stub signature: defaults are ``compression_level=12``, ``favor_dec_speed=True``."""
        ...

    def prepopulate(self, initial_data: bytes) -> None:
        """Stub signature: takes ``bytes``, annotated to return ``None``."""
        ...


class PythonIOStreamAdapter(IOStream):
    """``IOStream`` stub constructed from a ``typing.IO`` object."""

    def __init__(
        self,
        py_stream: IO,
    ) -> None:
        """Stub signature: takes the ``typing.IO`` object as ``py_stream``."""
        ...


class FastWARCError(Exception):
    """Exception base class; the other exception stubs in this module derive from it."""


class ReaderStaleError(FastWARCError):
    """``FastWARCError`` subclass; the stub declares no additional members."""


class StreamError(FastWARCError):
    """``FastWARCError`` subclass; the stub declares no additional members."""
48 changes: 48 additions & 0 deletions fastwarc/fastwarc/tools.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from enum import IntFlag
from typing import Union, Type, Iterator, Tuple

from .stream_io import IOStream
from .warc import WarcRecord


class CompressionAlg(IntFlag):
    """Integer constants naming the compression choices used by this stub's functions."""

    gzip = 0
    lz4 = 1
    uncompressed = 2
    # ``auto`` is the default of every ``comp_alg*`` parameter declared in this stub.
    auto = 3


def detect_compression_algorithm(
    file: str,
) -> CompressionAlg:
    """Stub signature: takes ``file`` as ``str``, annotated to return a ``CompressionAlg``."""
    ...


def wrap_warc_stream(
    file: Union[str, IOStream],
    mode: str,
    comp_alg: CompressionAlg = CompressionAlg.auto,
    **comp_args
) -> IOStream:
    """Stub signature for wrapping a WARC file or stream.

    ``file`` is annotated as either a ``str`` or an ``IOStream`` instance, and
    the result as an ``IOStream`` instance. ``Type[IOStream]`` would denote the
    class object itself rather than a stream instance, so it is not used here.

    :param file: a ``str`` or an ``IOStream`` instance
    :param mode: mode string
    :param comp_alg: compression algorithm, defaults to ``CompressionAlg.auto``
    :param comp_args: additional keyword arguments (unannotated)
    """
    ...


def recompress_warc_interactive(
    warc_in: Union[str, IOStream],
    warc_out: Union[str, IOStream],
    comp_alg_in: CompressionAlg = CompressionAlg.auto,
    comp_alg_out: CompressionAlg = CompressionAlg.auto,
    **comp_args
) -> Iterator[Tuple[WarcRecord, int]]:
    """Stub signature: annotated to return an iterator of ``(WarcRecord, int)`` pairs.

    ``warc_in`` and ``warc_out`` are each annotated as a ``str`` or an
    ``IOStream`` instance. ``Type[IOStream]`` would denote the class object
    itself rather than a stream instance, so it is not used here.

    :param warc_in: input, a ``str`` or an ``IOStream`` instance
    :param warc_out: output, a ``str`` or an ``IOStream`` instance
    :param comp_alg_in: defaults to ``CompressionAlg.auto``
    :param comp_alg_out: defaults to ``CompressionAlg.auto``
    :param comp_args: additional keyword arguments (unannotated)
    """
    ...


def recompress_warc(
    warc_in: Union[str, IOStream],
    warc_out: Union[str, IOStream],
    comp_alg_in: CompressionAlg = CompressionAlg.auto,
    comp_alg_out: CompressionAlg = CompressionAlg.auto,
    **comp_args
) -> Iterator[Tuple[WarcRecord, int]]:
    """Stub signature for the non-interactive recompression entry point.

    ``warc_in`` and ``warc_out`` are each annotated as a ``str`` or an
    ``IOStream`` instance. ``Type[IOStream]`` would denote the class object
    itself rather than a stream instance, so it is not used here.

    NOTE(review): the return annotation is identical to the one on
    ``recompress_warc_interactive``; confirm against the implementation that
    this function really returns an iterator as well.

    :param warc_in: input, a ``str`` or an ``IOStream`` instance
    :param warc_out: output, a ``str`` or an ``IOStream`` instance
    :param comp_alg_in: defaults to ``CompressionAlg.auto``
    :param comp_alg_out: defaults to ``CompressionAlg.auto``
    :param comp_args: additional keyword arguments (unannotated)
    """
    ...


def verify_digests(
    warc_in: Union[str, IOStream],
    verify_payloads: bool = False,
    comp_alg: CompressionAlg = CompressionAlg.auto,
) -> bool:
    """Stub signature for digest verification over a WARC input.

    ``warc_in`` is annotated as a ``str`` or an ``IOStream`` instance.
    ``Type[IOStream]`` would denote the class object itself rather than a
    stream instance, so it is not used here.

    NOTE(review): confirm the ``bool`` return annotation against the
    implementation.

    :param warc_in: input, a ``str`` or an ``IOStream`` instance
    :param verify_payloads: defaults to ``False``
    :param comp_alg: defaults to ``CompressionAlg.auto``
    """
    ...
90 changes: 90 additions & 0 deletions fastwarc/fastwarc/warc.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from datetime import datetime
from enum import IntFlag
from typing import (
    Optional,
    Iterator,
    Dict,
    List,
    Tuple,
    MutableMapping,
    Iterable,
    ValuesView,
    KeysView,
    Type,
)

from .stream_io import BufferedReader, IOStream


class WarcRecordType(IntFlag):
    """Record-type flags; as ``IntFlag`` members they can be OR-ed into a mask."""

    # One bit per named record type.
    warcinfo = 0x0002
    response = 0x0004
    resource = 0x0008
    request = 0x0010
    metadata = 0x0020
    revisit = 0x0040
    conversion = 0x0080
    continuation = 0x0100
    unknown = 0x0200

    # Masks: all sixteen low bits set, and no bits set.
    any_type = 0xFFFF
    no_type = 0x0000


# Module-level shorthands for two WarcRecordType members; both are used as
# parameter defaults further down in this stub.
any_type = WarcRecordType.any_type
no_type = WarcRecordType.no_type


class WarcHeaderMap(MutableMapping[str, str]):
    """Stub for a ``str`` -> ``str`` mutable mapping of header fields.

    Besides the mapping protocol it declares status-line attributes, an
    ``append`` method, conversions to ``dict`` / tuples, and ``write``.
    """

    reason_phrase: Optional[str]
    # NOTE(review): confirm whether the implementation exposes the status code
    # as ``str`` or as ``int``.
    status_code: Optional[str]
    status_line: str

    def append(self, key: str, value: str) -> None: ...

    def asdict(self) -> Dict[str, str]: ...

    # Plural "tuples": annotated as a list of (key, value) pairs rather than
    # a single two-element tuple.
    def astuples(self) -> List[Tuple[str, str]]: ...

    def clear(self) -> None: ...

    def get(self, key: str, default: Optional[str] = None) -> Optional[str]: ...

    # NOTE(review): ``Mapping.items`` is declared to return an ``ItemsView``;
    # confirm that narrowing it to ``Iterator`` here is intended.
    def items(self) -> Iterator[Tuple[str, str]]: ...

    def keys(self) -> KeysView[str]: ...

    def values(self) -> ValuesView[str]: ...

    def write(self, stream: IOStream) -> None: ...


class WarcRecord:
    """Stub for a single WARC record.

    Each attribute is declared exactly once; a repeated declaration of the
    same name is reported as a redefinition by type checkers.
    """

    # Record-level attributes.
    record_id: str
    record_type: WarcRecordType
    content_length: int
    record_date: Optional[datetime]
    headers: WarcHeaderMap
    reader: BufferedReader
    stream_pos: int

    # HTTP-related attributes.
    is_http: bool
    is_http_parsed: bool
    http_headers: Optional[WarcHeaderMap]
    http_content_type: Optional[str]
    http_charset: Optional[str]
    http_date: Optional[datetime]
    http_last_modified: Optional[datetime]

    # NOTE(review): ``record_urn`` and the return type are left unannotated
    # because nothing in this stub establishes them; confirm against the
    # implementation.
    def init_headers(
        self,
        content_length: int = 0,
        record_type: WarcRecordType = no_type,
        record_urn=None,
    ): ...

    def freeze(self) -> bool: ...

    def set_bytes_content(self, content: bytes) -> None: ...

    def parse_http(
        self,
        strict_mode: bool = True,
        auto_decode: str = "none",
    ) -> None: ...

    def verify_block_digest(self, consume: bool = False) -> bool: ...

    def verify_payload_digest(self, consume: bool = False) -> bool: ...


class ArchiveIterator(Iterable[WarcRecord]):
    """Stub for an iterable of ``WarcRecord`` objects built over a stream.

    ``stream`` is annotated as an ``IOStream`` instance. ``Type[IOStream]``
    would denote the class object itself rather than a stream instance, so it
    is not used here.
    """

    def __init__(
        self,
        stream: IOStream,
        record_types: WarcRecordType = any_type,
        parse_http: bool = True,
        min_content_length: int = -1,
        max_content_length: int = -1,
    ) -> None: ...

    # Both halves of the iterator protocol are declared.
    def __iter__(self) -> Iterator[WarcRecord]: ...

    def __next__(self) -> WarcRecord: ...
3 changes: 3 additions & 0 deletions fastwarc/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ test = [
[tool.setuptools.packages.find]
include = ["fastwarc*"]

# Include the .pyi stub files as package data for every package.
# NOTE(review): PEP 561 also expects a `py.typed` marker file next to inline
# stubs; confirm one is shipped, otherwise type checkers may ignore the stubs.
[tool.setuptools.package-data]
"*" = ["*.pyi"]

[tool.cibuildwheel]
archs = "native"
build = "cp3*"
Expand Down

0 comments on commit d19eeb4

Please sign in to comment.