Add explicit __all__ to modules

chatnoir-eu · Aug 9, 2024 · 774e33f
1 parent 360a19c
commit 774e33f
Show file tree

Hide file tree

Showing 17 changed files with 152 additions and 0 deletions.
diff --git a/fastwarc/fastwarc/stream_io.pyx b/fastwarc/fastwarc/stream_io.pyx
@@ -25,6 +25,22 @@ from resiliparse_inc.stdio cimport fclose, ferror, fflush, fopen, fread, fseek,
 
 import brotli
 
+__all__ = [
+    'BufferedReader',
+    'BrotliStream',
+    'BytesIOStream',
+    'CompressingStream',
+    'FastWARCError',
+    'GZipStream',
+    'IOStream',
+    'FileStream',
+    'LZ4Stream',
+    'PythonIOStreamAdapter',
+    'ReaderStaleError',
+    'StreamError',
+    'wrap_stream',
+]
+
 
 class FastWARCError(Exception):
     """Generic FastWARC exception."""

diff --git a/fastwarc/fastwarc/tools.pyx b/fastwarc/fastwarc/tools.pyx
@@ -18,6 +18,15 @@ from fastwarc.stream_io cimport IOStream, GZipStream, LZ4Stream, FileStream, Pyt
 from fastwarc.stream_io import StreamError
 from fastwarc.warc cimport ArchiveIterator, WarcRecordType
 
+__all__ = [
+    'CompressionAlg',
+    'detect_compression_algorithm',
+    'recompress_warc',
+    'recompress_warc_interactive',
+    'verify_digests',
+    'wrap_warc_stream',
+]
+
 
 cpdef enum CompressionAlg:
     gzip,

diff --git a/fastwarc/fastwarc/warc.pyx b/fastwarc/fastwarc/warc.pyx
@@ -40,6 +40,33 @@ from fastwarc.stream_io cimport BufferedReader, BytesIOStream, CompressingStream
     IOStream, PythonIOStreamAdapter
 from fastwarc.stream_io import ReaderStaleError
 
+__all__ = [
+    'ArchiveIterator',
+    'CaseInsensitiveStr',
+    'CaseInsensitiveStrDict',
+    'WarcHeaderMap',
+    'WarcRecord',
+    'is_http',
+    'is_concurrent',
+    'is_warc_10',
+    'is_warc_11',
+    'has_block_digest',
+    'has_payload_digest',
+
+    'WarcRecordType',
+    'warcinfo',
+    'response',
+    'resource',
+    'request',
+    'metadata',
+    'revisit',
+    'conversion',
+    'continuation',
+    'unknown',
+    'any_type',
+    'no_type',
+]
+
 
 cdef const char* _enum_record_type_to_str(WarcRecordType record_type) noexcept nogil:
     if record_type == warcinfo:

diff --git a/resiliparse/resiliparse/beam/__init__.py b/resiliparse/resiliparse/beam/__init__.py
@@ -16,3 +16,11 @@
     import apache_beam
 except ModuleNotFoundError:
     raise ModuleNotFoundError('Missing dependency: apache_beam')
+
+__all__ = [
+    'coders',
+    'elasticsearch',
+    'fileio',
+    'textio',
+    'warcio'
+]
diff --git a/resiliparse/resiliparse/beam/coders.py b/resiliparse/resiliparse/beam/coders.py
@@ -15,6 +15,10 @@
 from apache_beam.coders import StrUtf8Coder as _StrUtf8Coder
 from resiliparse.parse.encoding import bytes_to_str
 
+__all__ = [
+    'StrUtf8Coder'
+]
+
 
 class StrUtf8Coder(_StrUtf8Coder):
     """

diff --git a/resiliparse/resiliparse/beam/elasticsearch.py b/resiliparse/resiliparse/beam/elasticsearch.py
@@ -21,6 +21,14 @@
 from elasticsearch import exceptions as es_exc, Elasticsearch
 from elasticsearch.helpers import BulkIndexError, streaming_bulk
 
+__all__ = [
+    'ElasticsearchBulkIndex',
+    'delete_action',
+    'ensure_index',
+    'index_action',
+    'update_action'
+]
+
 
 logger = logging.getLogger()
 

diff --git a/resiliparse/resiliparse/beam/fileio.py b/resiliparse/resiliparse/beam/fileio.py
@@ -15,6 +15,10 @@
 import apache_beam as beam
 from apache_beam.io import fileio as beam_fio
 
+__all__ = [
+    'MatchFiles'
+]
+
 
 class MatchFiles(beam.PTransform):
     """

diff --git a/resiliparse/resiliparse/beam/textio.py b/resiliparse/resiliparse/beam/textio.py
@@ -22,6 +22,11 @@
 from resiliparse.beam.coders import StrUtf8Coder
 from resiliparse.beam.fileio import MatchFiles
 
+__all__ = [
+    'ReadAllFromText',
+    'ReadFromText',
+]
+
 
 DEFAULT_DESIRED_SPLIT_SIZE = 64 * 1024 * 1024   # 64 MiB
 DEFAULT_MIN_SPLIT_SIZE = 1024 * 1024            # 1 MiB

diff --git a/resiliparse/resiliparse/beam/warcio.py b/resiliparse/resiliparse/beam/warcio.py
@@ -34,6 +34,11 @@
 
 logger = logging.getLogger()
 
+__all__ = [
+    'ReadAllWarcs',
+    'ReadWarcs',
+]
+
 
 class ReadWarcs(beam.PTransform):
     """

diff --git a/resiliparse/resiliparse/extract/html2text.pyx b/resiliparse/resiliparse/extract/html2text.pyx
@@ -30,6 +30,11 @@ from resiliparse_inc.string_view cimport string_view
 from resiliparse_inc.utility cimport move
 
 
+__all__ = [
+    'extract_plain_text',
+]
+
+
 cdef extern from * nogil:
     """
     struct ExtractOpts {

diff --git a/resiliparse/resiliparse/itertools.pyx b/resiliparse/resiliparse/itertools.pyx
@@ -17,6 +17,11 @@
 import typing as t
 from fastwarc.warc cimport ArchiveIterator
 
+__all__ = [
+    'progress_loop',
+    'exc_loop',
+    'warc_retry',
+]
 
 def progress_loop(it, ctx=None):
     """

diff --git a/resiliparse/resiliparse/parse/encoding.pyx b/resiliparse/resiliparse/parse/encoding.pyx
@@ -27,6 +27,13 @@ from resiliparse_inc.lexbor cimport lxb_char_t, lxb_status_t, lxb_html_encoding_
 from resiliparse_inc.uchardet cimport uchardet_new, uchardet_delete, uchardet_handle_data, \
     uchardet_data_end, uchardet_reset, uchardet_get_charset
 
+__all__ = [
+    'EncodingDetector',
+    'bytes_to_str',
+    'detect_encoding',
+    'detect_mime',
+    'map_encoding_to_html5',
+]
 
 # Encoding name and label map according to https://encoding.spec.whatwg.org/#names-and-labels
 # Differences:

diff --git a/resiliparse/resiliparse/parse/html.pyx b/resiliparse/resiliparse/parse/html.pyx
@@ -24,6 +24,30 @@ from libcpp.set cimport set as unordered_set
 from resiliparse_inc.lexbor cimport *
 from resiliparse.parse.encoding cimport bytes_to_str, map_encoding_to_html5
 
+__all__ = [
+    'DOMCollection',
+    'DOMContext',
+    'DOMElementClassList',
+    'DOMNode',
+    'HTMLTree',
+    'traverse_dom',
+
+    'NodeType',
+    'ELEMENT',
+    'ATTRIBUTE',
+    'TEXT',
+    'CDATA_SECTION',
+    'ENTITY_REFERENCE',
+    'ENTITY',
+    'PROCESSING_INSTRUCTION',
+    'COMMENT',
+    'DOCUMENT',
+    'DOCUMENT_TYPE',
+    'DOCUMENT_FRAGMENT',
+    'NOTATION',
+    'LAST_ENTRY',
+]
+
 
 cdef inline DOMNode _create_dom_node(HTMLTree tree, lxb_dom_node_t* dom_node):
     if not dom_node:

diff --git a/resiliparse/resiliparse/parse/http.pyx b/resiliparse/resiliparse/parse/http.pyx
@@ -19,6 +19,11 @@ import typing as t
 from libcpp.string cimport string
 from resiliparse_inc.cstdlib cimport strtol
 
+__all__ = [
+    'iterate_http_chunks',
+    'read_http_chunk',
+]
+
 
 cpdef bytes read_http_chunk(reader):
     """

diff --git a/resiliparse/resiliparse/parse/lang.pyx b/resiliparse/resiliparse/parse/lang.pyx
@@ -21,6 +21,12 @@ cimport cython
 from cython.operator cimport preincrement as preinc
 from cpython.unicode cimport Py_UNICODE_ISALPHA, Py_UNICODE_ISSPACE
 
+__all__ = [
+    'detect_fast',
+    'supported_langs',
+    'train_language_examples'
+]
+
 
 @cython.wraparound(False)
 cdef lang_vec8_t str_to_vec(str train_text, size_t vec_len=LANG_VEC_SIZE):

diff --git a/resiliparse/resiliparse/process_guard.pyx b/resiliparse/resiliparse/process_guard.pyx
@@ -31,6 +31,17 @@ from resiliparse_inc.stdio cimport FILE, fclose, feof, fgets, fopen
 from resiliparse_inc.time cimport timespec, clock_gettime, CLOCK_MONOTONIC
 from resiliparse_inc.unistd cimport getpagesize, getpid, usleep
 
+__all__ = [
+    'mem_guard',
+    'progress',
+    'progress_loop',
+    'time_guard',
+    'ExecutionTimeout',
+    'ResiliparseGuardException',
+    'MemGuard',
+    'MemoryLimitExceeded',
+    'TimeGuard',
+]
 
 class ResiliparseGuardException(BaseException):
     """Resiliparse guard base exception."""

diff --git a/tests/fastwarc/test_warc.py b/tests/fastwarc/test_warc.py
@@ -1,8 +1,10 @@
 from base64 import b32encode
 import brotli
+import codecs
 import datetime
 from email.utils import format_datetime
 import gzip
+import hashlib
 import lz4.frame
 import io
 import os