Skip to content

Commit

Permalink
Fix code format, LINT, typing, and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Nov 2, 2023
1 parent 96136bb commit 8f669a9
Show file tree
Hide file tree
Showing 9 changed files with 34 additions and 31 deletions.
9 changes: 3 additions & 6 deletions archive_query_log/legacy/download/iterable.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from dataclasses import dataclass
from functools import cached_property
from gzip import open as gzip_open
from json import JSONDecodeError
from pathlib import Path
from typing import Sized, Iterable, Iterator, IO
Expand Down Expand Up @@ -36,17 +35,14 @@ def _check_raw_serps_paths(self):
def _streams(self) -> Iterator[tuple[Path, IO[bytes]]]:
files = self.path.glob("*.warc.gz")
for file in files:
with gzip_open(file, "rb") as stream:
with file.open( "rb") as stream:
yield file, stream

def __len__(self) -> int:
return sum(
1
for _, stream in self._streams()
for record in ArchiveIterator(
stream,
no_record_parse=True,
)
for record in ArchiveIterator(stream, no_record_parse=True)
if record.rec_type == "response"
)

Expand All @@ -73,6 +69,7 @@ def _read_serp_content(
content_type = record.http_headers.get_header("Content-Type")
if content_type is None:
content_type = "utf8"
print(record_url_header)
return ArchivedRawSerp(
url=archived_serp_url.url,
timestamp=archived_serp_url.timestamp,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# flake8: noqa
# This file is auto-generated by generate_tests.py.
from archive_query_log.results.test.test_utils import verify_serp_parsing
from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing


def test_parse_query_ask_peter_krogh_photographer_1184320758():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# flake8: noqa
# This file is auto-generated by generate_tests.py.
from archive_query_log.results.test.test_utils import verify_serp_parsing
from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing


def test_parse_query_brave_chomikuj_1656776694():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# flake8: noqa
# This file is auto-generated by generate_tests.py.
from archive_query_log.results.test.test_utils import verify_serp_parsing
from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing


def test_parse_query_chefkoch_spaghetti_eis_torte_1342866905():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# flake8: noqa
# This file is auto-generated by generate_tests.py.
from archive_query_log.results.test.test_utils import verify_serp_parsing
from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing


def test_parse_query_duckduckgo_3rd_party_twitch_chat_1642095474():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# flake8: noqa
# This file is auto-generated by generate_tests.py.
from archive_query_log.results.test.test_utils import verify_serp_parsing
from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing


def test_parse_query_ecosia_financial_risk_tolerance_quiz_1643759873():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# flake8: noqa
# This file is auto-generated by generate_tests.py.
from archive_query_log.results.test.test_utils import verify_serp_parsing
from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing


def test_parse_query_qwant_administrateur_general_du_cnam_1619206522():
Expand Down
33 changes: 18 additions & 15 deletions archive_query_log/memento.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,28 +34,26 @@ class MementoApi:
@overload
def load_capture(
self,
url: str,
timestamp: datetime | None = ...,
url_or_cdx_capture: str,
timestamp: datetime | None = None,
) -> Response:
"""
Load a captured document from the Memento API.
:param url: The original URL of the document.
:param url_or_cdx_capture: The original URL of the document.
:param timestamp: Timestamp of the capture.
:return: HTTP response.
"""
pass

@overload
def load_capture(
self,
cdx_capture: CdxCapture,
url_or_cdx_capture: CdxCapture,
) -> Response:
"""
Load a captured document from the Memento API.
:param cdx_capture: The CDX record describing the capture.
:param url_or_cdx_capture: The CDX record describing the capture.
:return: HTTP response.
"""
pass

def load_capture(
self,
Expand All @@ -69,6 +67,13 @@ def load_capture(
:param timestamp: Timestamp of the capture.
:return: HTTP response.
"""
return self._load_capture(url_or_cdx_capture, timestamp)

def _load_capture(
self,
url_or_cdx_capture: str | CdxCapture,
timestamp: datetime | None,
) -> Response:
if not (isinstance(url_or_cdx_capture, str) or
isinstance(url_or_cdx_capture, CdxCapture)):
raise TypeError("URL must be a string or CdxCapture.")
Expand All @@ -91,30 +96,28 @@ def load_capture(
@overload
def load_capture_warc(
self,
url: str,
timestamp: datetime | None = ...,
url_or_cdx_capture: str,
timestamp: datetime | None = None,
) -> Iterator[ArcWarcRecord]:
"""
Load a captured document from the Memento API and
capture the HTTP request and response as WARC records.
:param url: The original URL of the document.
:param url_or_cdx_capture: The original URL of the document.
:param timestamp: Timestamp of the capture.
:return: Iterator over request and response WARC records.
"""
pass

@overload
def load_capture_warc(
self,
cdx_capture: CdxCapture,
url_or_cdx_capture: CdxCapture,
) -> Iterator[ArcWarcRecord]:
"""
Load a captured document from the Memento API and
capture the HTTP request and response as WARC records.
:param cdx_capture: The CDX record describing the capture.
:param url_or_cdx_capture: The CDX record describing the capture.
:return: Iterator over request and response WARC records.
"""
pass

def load_capture_warc(
self,
Expand All @@ -130,5 +133,5 @@ def load_capture_warc(
:return: Iterator over request and response WARC records.
"""
with capture_http() as writer:
self.load_capture(url_or_cdx_capture, timestamp)
self._load_capture(url_or_cdx_capture, timestamp)
yield from ArchiveIterator(writer.get_stream())
11 changes: 7 additions & 4 deletions archive_query_log/warc_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from itertools import chain
from pathlib import Path
from tempfile import TemporaryFile
from typing import IO, NamedTuple, Iterable, Iterator, ContextManager
from typing import IO, NamedTuple, Iterable, Iterator
from uuid import uuid4
from warnings import warn

Expand Down Expand Up @@ -166,7 +166,7 @@ def write(self, records: Iterable[WarcRecord]) -> Iterator[WarcS3Record]:
# Find next available key.
key: str = f"{uuid4().hex}.warc.gz"
while self._exists_object(key):
key: str = f"{uuid4().hex}.warc.gz"
key = f"{uuid4().hex}.warc.gz"

# Write records to buffer.
offset_records: Iterable[_WarcS3Record] = _write_records(
Expand All @@ -185,7 +185,8 @@ def write(self, records: Iterable[WarcRecord]) -> Iterator[WarcS3Record]:
lambda record: record.location is not None,
offset_records,
)
saved_records = list(saved_records)
# Consume iterator to write records to buffer.
saved_records = iter(list(saved_records))
tmp_file.flush()
tmp_file.seek(0)

Expand All @@ -199,6 +200,8 @@ def write(self, records: Iterable[WarcRecord]) -> Iterator[WarcS3Record]:
Key=key,
)
for offset_record in saved_records:
if offset_record.location is None:
raise RuntimeError("Expected location to be set.")
yield WarcS3Record(
record=offset_record.record,
location=offset_record.location,
Expand All @@ -210,7 +213,7 @@ def write(self, records: Iterable[WarcRecord]) -> Iterator[WarcS3Record]:
head, records = spy(records)

@contextmanager
def read(self, location: WarcS3Location) -> ContextManager[WarcRecord]:
def read(self, location: WarcS3Location) -> Iterator[WarcRecord]:
end_offset = location.offset + location.length - 1
response = self.client.get_object(
Bucket=self.bucket_name,
Expand Down

0 comments on commit 8f669a9

Please sign in to comment.