Fix code format, LINT, typing, and tests

webis-de · Nov 2, 2023 · 8f669a9
1 parent 96136bb
commit 8f669a9
Show file tree

Hide file tree

Showing 9 changed files with 34 additions and 31 deletions.
diff --git a/archive_query_log/legacy/download/iterable.py b/archive_query_log/legacy/download/iterable.py
@@ -1,6 +1,5 @@
 from dataclasses import dataclass
 from functools import cached_property
-from gzip import open as gzip_open
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Sized, Iterable, Iterator, IO
@@ -36,17 +35,14 @@ def _check_raw_serps_paths(self):
     def _streams(self) -> Iterator[tuple[Path, IO[bytes]]]:
         files = self.path.glob("*.warc.gz")
         for file in files:
-            with gzip_open(file, "rb") as stream:
+            with file.open( "rb") as stream:
                 yield file, stream
 
     def __len__(self) -> int:
         return sum(
             1
             for _, stream in self._streams()
-            for record in ArchiveIterator(
-                stream,
-                no_record_parse=True,
-            )
+            for record in ArchiveIterator(stream, no_record_parse=True)
             if record.rec_type == "response"
         )
 
@@ -73,6 +69,7 @@ def _read_serp_content(
         content_type = record.http_headers.get_header("Content-Type")
         if content_type is None:
             content_type = "utf8"
+        print(record_url_header)
         return ArchivedRawSerp(
             url=archived_serp_url.url,
             timestamp=archived_serp_url.timestamp,

diff --git a/archive_query_log/legacy/results/test/test_ask_serp_parsing.py b/archive_query_log/legacy/results/test/test_ask_serp_parsing.py
@@ -1,6 +1,6 @@
 # flake8: noqa
 # This file is auto-generated by generate_tests.py.
-from archive_query_log.results.test.test_utils import verify_serp_parsing
+from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing
 
 
 def test_parse_query_ask_peter_krogh_photographer_1184320758():

diff --git a/archive_query_log/legacy/results/test/test_brave_serp_parsing.py b/archive_query_log/legacy/results/test/test_brave_serp_parsing.py
@@ -1,6 +1,6 @@
 # flake8: noqa
 # This file is auto-generated by generate_tests.py.
-from archive_query_log.results.test.test_utils import verify_serp_parsing
+from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing
 
 
 def test_parse_query_brave_chomikuj_1656776694():

diff --git a/archive_query_log/legacy/results/test/test_chefkoch_serp_parsing.py b/archive_query_log/legacy/results/test/test_chefkoch_serp_parsing.py
@@ -1,6 +1,6 @@
 # flake8: noqa
 # This file is auto-generated by generate_tests.py.
-from archive_query_log.results.test.test_utils import verify_serp_parsing
+from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing
 
 
 def test_parse_query_chefkoch_spaghetti_eis_torte_1342866905():

diff --git a/archive_query_log/legacy/results/test/test_duckduckgo_serp_parsing.py b/archive_query_log/legacy/results/test/test_duckduckgo_serp_parsing.py
@@ -1,6 +1,6 @@
 # flake8: noqa
 # This file is auto-generated by generate_tests.py.
-from archive_query_log.results.test.test_utils import verify_serp_parsing
+from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing
 
 
 def test_parse_query_duckduckgo_3rd_party_twitch_chat_1642095474():

diff --git a/archive_query_log/legacy/results/test/test_ecosia_serp_parsing.py b/archive_query_log/legacy/results/test/test_ecosia_serp_parsing.py
@@ -1,6 +1,6 @@
 # flake8: noqa
 # This file is auto-generated by generate_tests.py.
-from archive_query_log.results.test.test_utils import verify_serp_parsing
+from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing
 
 
 def test_parse_query_ecosia_financial_risk_tolerance_quiz_1643759873():

diff --git a/archive_query_log/legacy/results/test/test_qwant_serp_parsing.py b/archive_query_log/legacy/results/test/test_qwant_serp_parsing.py
@@ -1,6 +1,6 @@
 # flake8: noqa
 # This file is auto-generated by generate_tests.py.
-from archive_query_log.results.test.test_utils import verify_serp_parsing
+from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing
 
 
 def test_parse_query_qwant_administrateur_general_du_cnam_1619206522():

diff --git a/archive_query_log/memento.py b/archive_query_log/memento.py
@@ -34,28 +34,26 @@ class MementoApi:
     @overload
     def load_capture(
             self,
-            url: str,
-            timestamp: datetime | None = ...,
+            url_or_cdx_capture: str,
+            timestamp: datetime | None = None,
     ) -> Response:
         """
         Load a captured document from the Memento API.
-        :param url: The original URL of the document.
+        :param url_or_cdx_capture: The original URL of the document.
         :param timestamp: Timestamp of the capture.
         :return: HTTP response.
         """
-        pass
 
     @overload
     def load_capture(
             self,
-            cdx_capture: CdxCapture,
+            url_or_cdx_capture: CdxCapture,
     ) -> Response:
         """
         Load a captured document from the Memento API.
-        :param cdx_capture: The CDX record describing the capture.
+        :param url_or_cdx_capture: The CDX record describing the capture.
         :return: HTTP response.
         """
-        pass
 
     def load_capture(
             self,
@@ -69,6 +67,13 @@ def load_capture(
         :param timestamp: Timestamp of the capture.
         :return: HTTP response.
         """
+        return self._load_capture(url_or_cdx_capture, timestamp)
+
+    def _load_capture(
+            self,
+            url_or_cdx_capture: str | CdxCapture,
+            timestamp: datetime | None,
+    ) -> Response:
         if not (isinstance(url_or_cdx_capture, str) or
                 isinstance(url_or_cdx_capture, CdxCapture)):
             raise TypeError("URL must be a string or CdxCapture.")
@@ -91,30 +96,28 @@ def load_capture(
     @overload
     def load_capture_warc(
             self,
-            url: str,
-            timestamp: datetime | None = ...,
+            url_or_cdx_capture: str,
+            timestamp: datetime | None = None,
     ) -> Iterator[ArcWarcRecord]:
         """
         Load a captured document from the Memento API and
         capture the HTTP request and response as WARC records.
-        :param url: The original URL of the document.
+        :param url_or_cdx_capture: The original URL of the document.
         :param timestamp: Timestamp of the capture.
         :return: Iterator over request and response WARC records.
         """
-        pass
 
     @overload
     def load_capture_warc(
             self,
-            cdx_capture: CdxCapture,
+            url_or_cdx_capture: CdxCapture,
     ) -> Iterator[ArcWarcRecord]:
         """
         Load a captured document from the Memento API and
         capture the HTTP request and response as WARC records.
-        :param cdx_capture: The CDX record describing the capture.
+        :param url_or_cdx_capture: The CDX record describing the capture.
         :return: Iterator over request and response WARC records.
         """
-        pass
 
     def load_capture_warc(
             self,
@@ -130,5 +133,5 @@ def load_capture_warc(
         :return: Iterator over request and response WARC records.
         """
         with capture_http() as writer:
-            self.load_capture(url_or_cdx_capture, timestamp)
+            self._load_capture(url_or_cdx_capture, timestamp)
             yield from ArchiveIterator(writer.get_stream())
diff --git a/archive_query_log/warc_s3.py b/archive_query_log/warc_s3.py
@@ -6,7 +6,7 @@
 from itertools import chain
 from pathlib import Path
 from tempfile import TemporaryFile
-from typing import IO, NamedTuple, Iterable, Iterator, ContextManager
+from typing import IO, NamedTuple, Iterable, Iterator
 from uuid import uuid4
 from warnings import warn
 
@@ -166,7 +166,7 @@ def write(self, records: Iterable[WarcRecord]) -> Iterator[WarcS3Record]:
                 # Find next available key.
                 key: str = f"{uuid4().hex}.warc.gz"
                 while self._exists_object(key):
-                    key: str = f"{uuid4().hex}.warc.gz"
+                    key = f"{uuid4().hex}.warc.gz"
 
                 # Write records to buffer.
                 offset_records: Iterable[_WarcS3Record] = _write_records(
@@ -185,7 +185,8 @@ def write(self, records: Iterable[WarcRecord]) -> Iterator[WarcS3Record]:
                     lambda record: record.location is not None,
                     offset_records,
                 )
-                saved_records = list(saved_records)
+                # Consume iterator to write records to buffer.
+                saved_records = iter(list(saved_records))
                 tmp_file.flush()
                 tmp_file.seek(0)
 
@@ -199,6 +200,8 @@ def write(self, records: Iterable[WarcRecord]) -> Iterator[WarcS3Record]:
                     Key=key,
                 )
             for offset_record in saved_records:
+                if offset_record.location is None:
+                    raise RuntimeError("Expected location to be set.")
                 yield WarcS3Record(
                     record=offset_record.record,
                     location=offset_record.location,
@@ -210,7 +213,7 @@ def write(self, records: Iterable[WarcRecord]) -> Iterator[WarcS3Record]:
             head, records = spy(records)
 
     @contextmanager
-    def read(self, location: WarcS3Location) -> ContextManager[WarcRecord]:
+    def read(self, location: WarcS3Location) -> Iterator[WarcRecord]:
         end_offset = location.offset + location.length - 1
         response = self.client.get_object(
             Bucket=self.bucket_name,