Skip to content

Commit

Permalink
Add explicit __all__ to modules
Browse files Browse the repository at this point in the history
  • Loading branch information
phoerious committed Aug 9, 2024
1 parent 360a19c commit 774e33f
Show file tree
Hide file tree
Showing 17 changed files with 152 additions and 0 deletions.
16 changes: 16 additions & 0 deletions fastwarc/fastwarc/stream_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,22 @@ from resiliparse_inc.stdio cimport fclose, ferror, fflush, fopen, fread, fseek,

import brotli

# Public API of fastwarc.stream_io: the names bound by
# ``from fastwarc.stream_io import *``.
__all__ = [
'BufferedReader',
'BrotliStream',
'BytesIOStream',
'CompressingStream',
'FastWARCError',
'GZipStream',
'IOStream',
'FileStream',
'LZ4Stream',
'PythonIOStreamAdapter',
'ReaderStaleError',
'StreamError',
'wrap_stream',
]


class FastWARCError(Exception):
"""Generic FastWARC exception."""
Expand Down
9 changes: 9 additions & 0 deletions fastwarc/fastwarc/tools.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,15 @@ from fastwarc.stream_io cimport IOStream, GZipStream, LZ4Stream, FileStream, Pyt
from fastwarc.stream_io import StreamError
from fastwarc.warc cimport ArchiveIterator, WarcRecordType

# Public API of fastwarc.tools: the names bound by
# ``from fastwarc.tools import *``.
__all__ = [
'CompressionAlg',
'detect_compression_algorithm',
'recompress_warc',
'recompress_warc_interactive',
'verify_digests',
'wrap_warc_stream',
]


cpdef enum CompressionAlg:
gzip,
Expand Down
28 changes: 28 additions & 0 deletions fastwarc/fastwarc/warc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,34 @@ from fastwarc.stream_io cimport BufferedReader, BytesIOStream, CompressingStream
IOStream, PythonIOStreamAdapter
from fastwarc.stream_io import ReaderStaleError

# Public API of fastwarc.warc: the names bound by
# ``from fastwarc.warc import *``. Each name is listed exactly once.
__all__ = [
    'ArchiveIterator',
    'CaseInsensitiveStr',
    'CaseInsensitiveStrDict',
    'WarcHeaderMap',
    'WarcRecord',
    'is_http',
    'is_concurrent',
    'is_warc_10',
    'is_warc_11',
    'has_block_digest',
    'has_payload_digest',

    # WarcRecordType followed by the record type constants
    # (presumably the enum's members -- confirm against the .pxd).
    'WarcRecordType',
    'warcinfo',
    'response',
    'resource',
    'request',
    'metadata',
    'revisit',
    'conversion',
    'continuation',
    'unknown',
    'any_type',
    'no_type',
]


cdef const char* _enum_record_type_to_str(WarcRecordType record_type) noexcept nogil:
if record_type == warcinfo:
Expand Down
8 changes: 8 additions & 0 deletions resiliparse/resiliparse/beam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,11 @@
import apache_beam
except ModuleNotFoundError:
raise ModuleNotFoundError('Missing dependency: apache_beam')

# Submodules of the resiliparse.beam package that
# ``from resiliparse.beam import *`` makes available. For a package,
# entries of __all__ that are not yet attributes are imported as submodules.
__all__ = [
'coders',
'elasticsearch',
'fileio',
'textio',
'warcio'
]
4 changes: 4 additions & 0 deletions resiliparse/resiliparse/beam/coders.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
from apache_beam.coders import StrUtf8Coder as _StrUtf8Coder
from resiliparse.parse.encoding import bytes_to_str

# Public API of resiliparse.beam.coders. The imported base class is aliased
# with a leading underscore, so only the subclass defined here is exported.
__all__ = [
'StrUtf8Coder'
]


class StrUtf8Coder(_StrUtf8Coder):
"""
Expand Down
8 changes: 8 additions & 0 deletions resiliparse/resiliparse/beam/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@
from elasticsearch import exceptions as es_exc, Elasticsearch
from elasticsearch.helpers import BulkIndexError, streaming_bulk

# Public API of resiliparse.beam.elasticsearch: the names bound by
# ``from resiliparse.beam.elasticsearch import *``.
__all__ = [
'ElasticsearchBulkIndex',
'delete_action',
'ensure_index',
'index_action',
'update_action'
]


logger = logging.getLogger()

Expand Down
4 changes: 4 additions & 0 deletions resiliparse/resiliparse/beam/fileio.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
import apache_beam as beam
from apache_beam.io import fileio as beam_fio

# Public API of resiliparse.beam.fileio: the names bound by
# ``from resiliparse.beam.fileio import *``.
__all__ = [
'MatchFiles'
]


class MatchFiles(beam.PTransform):
"""
Expand Down
5 changes: 5 additions & 0 deletions resiliparse/resiliparse/beam/textio.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@
from resiliparse.beam.coders import StrUtf8Coder
from resiliparse.beam.fileio import MatchFiles

# Public API of resiliparse.beam.textio. Names imported from sibling modules
# (StrUtf8Coder, MatchFiles) are deliberately not re-exported here.
__all__ = [
'ReadAllFromText',
'ReadFromText',
]


DEFAULT_DESIRED_SPLIT_SIZE = 64 * 1024 * 1024 # 64 MiB
DEFAULT_MIN_SPLIT_SIZE = 1024 * 1024 # 1 MiB
Expand Down
5 changes: 5 additions & 0 deletions resiliparse/resiliparse/beam/warcio.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@

logger = logging.getLogger()

# Public API of resiliparse.beam.warcio: the names bound by
# ``from resiliparse.beam.warcio import *``.
__all__ = [
'ReadAllWarcs',
"ReadWarcs",
]


class ReadWarcs(beam.PTransform):
"""
Expand Down
5 changes: 5 additions & 0 deletions resiliparse/resiliparse/extract/html2text.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ from resiliparse_inc.string_view cimport string_view
from resiliparse_inc.utility cimport move


# Public API of resiliparse.extract.html2text: the names bound by
# ``from resiliparse.extract.html2text import *``.
__all__ = [
'extract_plain_text',
]


cdef extern from * nogil:
"""
struct ExtractOpts {
Expand Down
5 changes: 5 additions & 0 deletions resiliparse/resiliparse/itertools.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
import typing as t
from fastwarc.warc cimport ArchiveIterator

# Public API of resiliparse.itertools: the names bound by
# ``from resiliparse.itertools import *``.
__all__ = [
'progress_loop',
'exc_loop',
'warc_retry',
]

def progress_loop(it, ctx=None):
"""
Expand Down
7 changes: 7 additions & 0 deletions resiliparse/resiliparse/parse/encoding.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ from resiliparse_inc.lexbor cimport lxb_char_t, lxb_status_t, lxb_html_encoding_
from resiliparse_inc.uchardet cimport uchardet_new, uchardet_delete, uchardet_handle_data, \
uchardet_data_end, uchardet_reset, uchardet_get_charset

# Public API of resiliparse.parse.encoding: the names bound by
# ``from resiliparse.parse.encoding import *``.
__all__ = [
'EncodingDetector',
'bytes_to_str',
'detect_encoding',
'detect_mime',
'map_encoding_to_html5',
]

# Encoding name and label map according to https://encoding.spec.whatwg.org/#names-and-labels
# Differences:
Expand Down
24 changes: 24 additions & 0 deletions resiliparse/resiliparse/parse/html.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,30 @@ from libcpp.set cimport set as unordered_set
from resiliparse_inc.lexbor cimport *
from resiliparse.parse.encoding cimport bytes_to_str, map_encoding_to_html5

# Public API of resiliparse.parse.html: the names bound by
# ``from resiliparse.parse.html import *``.
__all__ = [
'DOMCollection',
'DOMContext',
'DOMElementClassList',
'DOMNode',
'HTMLTree',
'traverse_dom',

# NodeType followed by upper-case constants, presumably its enum
# members -- TODO confirm against the NodeType declaration.
'NodeType',
'ELEMENT',
'ATTRIBUTE',
'TEXT',
'CDATA_SECTION',
'ENTITY_REFERENCE',
'ENTITY',
'PROCESSING_INSTRUCTION',
'COMMENT',
'DOCUMENT',
'DOCUMENT_TYPE',
'DOCUMENT_FRAGMENT',
'NOTATION',
'LAST_ENTRY',
]


cdef inline DOMNode _create_dom_node(HTMLTree tree, lxb_dom_node_t* dom_node):
if not dom_node:
Expand Down
5 changes: 5 additions & 0 deletions resiliparse/resiliparse/parse/http.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ import typing as t
from libcpp.string cimport string
from resiliparse_inc.cstdlib cimport strtol

# Public API of resiliparse.parse.http: the names bound by
# ``from resiliparse.parse.http import *``.
__all__ = [
'iterate_http_chunks',
'read_http_chunk',
]


cpdef bytes read_http_chunk(reader):
"""
Expand Down
6 changes: 6 additions & 0 deletions resiliparse/resiliparse/parse/lang.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ cimport cython
from cython.operator cimport preincrement as preinc
from cpython.unicode cimport Py_UNICODE_ISALPHA, Py_UNICODE_ISSPACE

# Public API of resiliparse.parse.lang: the names bound by
# ``from resiliparse.parse.lang import *``.
__all__ = [
'detect_fast',
'supported_langs',
'train_language_examples'
]


@cython.wraparound(False)
cdef lang_vec8_t str_to_vec(str train_text, size_t vec_len=LANG_VEC_SIZE):
Expand Down
11 changes: 11 additions & 0 deletions resiliparse/resiliparse/process_guard.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,17 @@ from resiliparse_inc.stdio cimport FILE, fclose, feof, fgets, fopen
from resiliparse_inc.time cimport timespec, clock_gettime, CLOCK_MONOTONIC
from resiliparse_inc.unistd cimport getpagesize, getpid, usleep

# Public API of resiliparse.process_guard: the names bound by
# ``from resiliparse.process_guard import *``.
__all__ = [
'mem_guard',
'progress',
# NOTE(review): 'progress_loop' is defined in (and exported by)
# resiliparse/itertools.pyx. Confirm this module defines it as well: a name
# in __all__ that the module lacks makes ``import *`` raise AttributeError.
'progress_loop',
'time_guard',
'ExecutionTimeout',
'ResiliparseGuardException',
'MemGuard',
'MemoryLimitExceeded',
'TimeGuard',
]

class ResiliparseGuardException(BaseException):
"""Resiliparse guard base exception."""
Expand Down
2 changes: 2 additions & 0 deletions tests/fastwarc/test_warc.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from base64 import b32encode
import brotli
import codecs
import datetime
from email.utils import format_datetime
import gzip
import hashlib
import lz4.frame
import io
import os
Expand Down

0 comments on commit 774e33f

Please sign in to comment.