Skip to content

Commit

Permalink
feat!: add xsdata models
Browse files Browse the repository at this point in the history
  • Loading branch information
afuetterer committed Apr 17, 2024
1 parent 25ef4cb commit f1f1ad9
Show file tree
Hide file tree
Showing 19 changed files with 1,134 additions and 513 deletions.
5 changes: 3 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ repos:
- id: mypy
args: [--config-file=pyproject.toml]
additional_dependencies:
- httpx==0.26.0
- lxml-stubs==0.5.1
- httpx>=0.27.0
- lxml-stubs>=0.5.1
- xsdata>=24.4
exclude: tests

- repo: https://github.com/scientific-python/cookie
Expand Down
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dynamic = [
]
dependencies = [
"httpx>=0.25",
"lxml>=5.1",
"xsdata[cli,lxml]", # TODO: remove cli extra
]
[project.optional-dependencies]
dev = [
Expand Down Expand Up @@ -169,6 +169,9 @@ pydocstyle.convention = "google"
"src/oaipmh_scythe/client.py" = [
"PLR0913", # too-many-arguments
]
"src/oaipmh_scythe/models/*" = [
"D106",
]
"tests/*" = [
"D100", # undocumented-public-module
"D103", # undocumented-public-function
Expand Down
4 changes: 2 additions & 2 deletions src/oaipmh_scythe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
"""oaipmh-scythe: A Scythe for harvesting OAI-PMH repositories."""

from oaipmh_scythe.client import Scythe
from oaipmh_scythe.response import OAIResponse
from oaipmh_scythe.response import Response

__all__ = [
"Scythe",
"OAIResponse",
"Response",
]
48 changes: 19 additions & 29 deletions src/oaipmh_scythe/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@

from oaipmh_scythe.__about__ import __version__
from oaipmh_scythe.iterator import BaseOAIIterator, OAIItemIterator
from oaipmh_scythe.models import Header, Identify, MetadataFormat, OAIItem, Record, Set
from oaipmh_scythe.response import OAIResponse
from oaipmh_scythe.models import Header, Identify, MetadataFormat, Record, Set, Verb
from oaipmh_scythe.response import Response, _build_response
from oaipmh_scythe.utils import filter_dict_except_resumption_token, log_response, remove_none_values

if TYPE_CHECKING:
Expand All @@ -37,17 +37,6 @@
OAI_NAMESPACE: str = "{http://www.openarchives.org/OAI/2.0/}"


# Map OAI verbs to class representations
DEFAULT_CLASS_MAP = {
"GetRecord": Record,
"ListRecords": Record,
"ListIdentifiers": Header,
"ListSets": Set,
"ListMetadataFormats": MetadataFormat,
"Identify": Identify,
}


class Scythe:
"""A client for interacting with OAI-PMH interfaces, facilitating the harvesting of records, identifiers, and sets.
Expand Down Expand Up @@ -82,7 +71,6 @@ def __init__(
max_retries: int = 0,
retry_status_codes: Iterable[int] | None = None,
default_retry_after: int = 60,
class_mapping: dict[str, type[OAIItem]] | None = None,
encoding: str = "utf-8",
auth: AuthTypes | None = None,
timeout: int = 60,
Expand All @@ -99,7 +87,6 @@ def __init__(
self.retry_status_codes = retry_status_codes or (503,)
self.default_retry_after = default_retry_after
self.oai_namespace = OAI_NAMESPACE
self.class_mapping = class_mapping or DEFAULT_CLASS_MAP
self.encoding = encoding
self.auth = auth
self.timeout = timeout
Expand Down Expand Up @@ -149,7 +136,7 @@ def __exit__(
) -> None:
self.close()

def harvest(self, query: dict[str, str]) -> OAIResponse:
def harvest(self, query: dict[str, str]) -> Response:
"""Perform an HTTP request to the OAI server with the given parameters.
Send an OAI-PMH request to the server using the specified parameters. Handle retry logic
Expand All @@ -172,7 +159,7 @@ def harvest(self, query: dict[str, str]) -> OAIResponse:
time.sleep(retry_after)
http_response = self._request(query)
http_response.raise_for_status()
return OAIResponse(http_response, params=query)
return _build_response(http_response)

def _request(self, query: dict[str, str]) -> httpx.Response:
"""Send an HTTP request to the OAI server using the configured HTTP method and given query parameters.
Expand All @@ -195,7 +182,7 @@ def list_records(
set_: str | None = None,
resumption_token: str | None = None,
ignore_deleted: bool = False,
) -> Iterator[OAIResponse | Record]:
) -> Iterator[Response | Record]:
"""Issue a ListRecords request to the OAI server.
Send a request to list records from the OAI server, allowing for selective harvesting based on date range,
Expand Down Expand Up @@ -224,7 +211,7 @@ def list_records(
"""
_query = {
"verb": "ListRecords",
"verb": Verb.LIST_RECORDS.value,
"from": from_,
"until": until,
"metadataPrefix": metadata_prefix,
Expand All @@ -242,7 +229,7 @@ def list_identifiers(
set_: str | None = None,
resumption_token: str | None = None,
ignore_deleted: bool = False,
) -> Iterator[OAIResponse | Header]:
) -> Iterator[Response | Header]:
"""Issue a ListIdentifiers request to the OAI server.
Send a request to list record identifiers from the OAI server. This method allows filtering records based on
Expand Down Expand Up @@ -271,7 +258,7 @@ def list_identifiers(
"""
_query = {
"verb": "ListIdentifiers",
"verb": Verb.LIST_IDENTIFIERS.value,
"from": from_,
"until": until,
"metadataPrefix": metadata_prefix,
Expand All @@ -282,7 +269,7 @@ def list_identifiers(
query = remove_none_values(filter_dict_except_resumption_token(_query))
yield from self.iterator(self, query, ignore_deleted=ignore_deleted)

def list_sets(self, resumption_token: str | None = None) -> Iterator[OAIResponse | Set]:
def list_sets(self, resumption_token: str | None = None) -> Iterator[Response | Set]:
"""Issue a ListSets request to the OAI server.
Send a request to list all sets defined in the OAI server. Sets are used to categorize records in the OAI
Expand All @@ -303,7 +290,7 @@ def list_sets(self, resumption_token: str | None = None) -> Iterator[OAIResponse
"""
_query = {
"verb": "ListSets",
"verb": Verb.LIST_SETS.value,
"resumptionToken": resumption_token,
}
query = remove_none_values(filter_dict_except_resumption_token(_query))
Expand All @@ -323,10 +310,13 @@ def identify(self) -> Identify:
about the OAI server.
"""
query = {"verb": "Identify"}
return Identify(self.harvest(query))
# return Identify(self.harvest(query))

query = {"verb": Verb.IDENTIFY.value}
response = self.harvest(query)
return response.parsed.identify

def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> OAIResponse | Record:
def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> Response | Record:
"""Issue a GetRecord request to the OAI server.
Send a request to the OAI server to retrieve a specific record. The request is constructed with the provided
Expand All @@ -350,13 +340,13 @@ def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> OAIRes
"""
query = {
"verb": "GetRecord",
"verb": Verb.GET_RECORD.value,
"identifier": identifier,
"metadataPrefix": metadata_prefix,
}
return next(iter(self.iterator(self, query)))

def list_metadata_formats(self, identifier: str | None = None) -> Iterator[OAIResponse | MetadataFormat]:
def list_metadata_formats(self, identifier: str | None = None) -> Iterator[Response | MetadataFormat]:
"""Issue a ListMetadataFormats request to the OAI server.
Send a request to list the metadata formats available from the OAI server. This can be done for the entire
Expand All @@ -380,7 +370,7 @@ def list_metadata_formats(self, identifier: str | None = None) -> Iterator[OAIRe
"""
_query = {
"verb": "ListMetadataFormats",
"verb": Verb.LIST_METADATA_FORMATS.value,
"identifier": identifier,
}
query = remove_none_values(_query)
Expand Down
118 changes: 67 additions & 51 deletions src/oaipmh_scythe/iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,50 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from enum import StrEnum
from operator import attrgetter
from typing import TYPE_CHECKING

from oaipmh_scythe import exceptions
from oaipmh_scythe.models import ResumptionToken
from oaipmh_scythe.models import Header, Record, Verb

if TYPE_CHECKING:
from collections.abc import Iterator

from oaipmh_scythe import Scythe
from oaipmh_scythe.models import OAIItem
from oaipmh_scythe.response import OAIResponse

VERBS_ELEMENTS: dict[str, str] = {
"GetRecord": "record",
"ListRecords": "record",
"ListIdentifiers": "header",
"ListSets": "set",
"ListMetadataFormats": "metadataFormat",
"Identify": "Identify",
from oaipmh_scythe.models import Header, MetadataFormat, Record, ResumptionToken, Set
from oaipmh_scythe.response import Response


class GetRecord(StrEnum):
attribute = "get_record"
element = "record"


class ListIdentifiers(StrEnum):
attribute = "list_identifiers"
element = "header"


class ListRecords(StrEnum):
attribute = "list_records"
element = "record"


class ListSets(StrEnum):
attribute = "list_sets"
element = "set"


class ListMetadataFormats(StrEnum):
attribute = "list_metadataformats"
element = "metadataformat"


# Lookup table from the verb string carried in a query ("verb" key) to the enum naming
# where that verb's payload lives on the parsed response.
# NOTE(review): there is no entry for ListMetadataFormats, so a lookup with that verb
# raises KeyError; confirm whether the omission is intentional.
MAPPING = {
    verb.value: names
    for verb, names in (
        (Verb.LIST_IDENTIFIERS, ListIdentifiers),
        (Verb.GET_RECORD, GetRecord),
        (Verb.LIST_RECORDS, ListRecords),
        (Verb.LIST_SETS, ListSets),
    )
}


Expand Down Expand Up @@ -66,8 +91,8 @@ def __init__(self, scythe: Scythe, query: dict[str, str], ignore_deleted: bool =
self.scythe = scythe
self.query = query
self.ignore_deleted = ignore_deleted
self.verb: str = self.query["verb"]
self.oai_response: OAIResponse | None = None
self.verb = self.query["verb"]
self.response: Response | None = None
self.resumption_token: ResumptionToken | None = None
self._next_response()

Expand All @@ -87,17 +112,10 @@ def _get_resumption_token(self) -> ResumptionToken | None:
Returns:
A ResumptionToken instance if a token is found in the response, otherwise None.
"""
ns = self.scythe.oai_namespace
if (
self.oai_response is not None
and (token_element := self.oai_response.xml.find(f".//{ns}resumptionToken")) is not None
):
return ResumptionToken(
token=token_element.text,
cursor=token_element.attrib.get("cursor"), # type: ignore [arg-type]
complete_list_size=token_element.attrib.get("completeListSize"), # type: ignore [arg-type]
expiration_date=token_element.attrib.get("expirationDate"), # type: ignore [arg-type]
)
if self.response is not None:
attribute = MAPPING[self.verb].attribute.value
parsed_data = getattr(self.response.parsed, attribute)
return parsed_data.resumption_token
return None

def _next_response(self) -> None:
Expand All @@ -108,18 +126,9 @@ def _next_response(self) -> None:
If an error is encountered in the OAI response, an appropriate exception is raised.
"""
if self.resumption_token and self.resumption_token.token:
self.query = {"verb": self.verb, "resumptionToken": self.resumption_token.token}
self.oai_response = self.scythe.harvest(self.query)

if (error := self.oai_response.xml.find(f".//{self.scythe.oai_namespace}error")) is not None:
code = str(error.attrib.get("code", "UNKNOWN"))
description = error.text or ""
try:
exception_name = code[0].upper() + code[1:]
raise getattr(exceptions, exception_name)(description)
except AttributeError as exc:
raise exceptions.GeneralOAIPMHError(description) from exc
if self.resumption_token is not None:
self.query = {"verb": self.verb, "resumptionToken": self.resumption_token.value}
self.response = self.scythe.harvest(self.query)
self.resumption_token = self._get_resumption_token()


Expand All @@ -131,7 +140,7 @@ class OAIResponseIterator(BaseOAIIterator):
underlying mechanisms of the BaseOAIIterator, including handling of resumption tokens for paginated data.
"""

def __iter__(self) -> Iterator[OAIResponse]:
def __iter__(self) -> Iterator[Response]:
"""Yield the next OAIResponse object from the server response sequence.
Enable the OAIResponseIterator to iterate over a series of OAIResponse objects, managing pagination
Expand All @@ -141,10 +150,10 @@ def __iter__(self) -> Iterator[OAIResponse]:
OAIResponse: The next available OAIResponse object in the sequence.
"""
while True:
if self.oai_response:
yield self.oai_response
self.oai_response = None
elif self.resumption_token and self.resumption_token.token:
if self.response:
yield self.response
self.response = None
elif self.resumption_token:
self._next_response()
else:
return
Expand All @@ -164,8 +173,10 @@ class OAIItemIterator(BaseOAIIterator):

def __init__(self, scythe: Scythe, query: dict[str, str], ignore_deleted: bool = False) -> None:
    """Prepare the item accessor for the requested verb, then start harvesting.

    Args:
        scythe: Client passed through to the base iterator.
        query: Request parameters; ``query["verb"]`` selects the entry in ``MAPPING``.
        ignore_deleted: Passed through to the base iterator.

    Raises:
        KeyError: If ``query`` has no ``"verb"`` key or the verb has no entry in ``MAPPING``.
    """
    self.verb = query["verb"]
    names = MAPPING[self.verb]
    # Dotted path from the parsed response down to the verb's items, e.g. "list_records.record".
    self.items_getter = attrgetter(f"{names.attribute.value}.{names.element.value}")
    # Must run last: the base initializer calls _next_response(), which reads items_getter.
    super().__init__(scythe, query, ignore_deleted)

def _next_response(self) -> None:
Expand All @@ -175,12 +186,12 @@ def _next_response(self) -> None:
for the specific elements (e.g. records, headers) based on the current resumption token.
"""
super()._next_response()
if self.oai_response is not None:
self._items = self.oai_response.xml.iterfind(f".//{self.scythe.oai_namespace}{self.element}")
if self.response is not None:
self._items = self.items_getter(self.response.parsed)
else:
self._items = iter(())

def __iter__(self) -> Iterator[OAIItem]:
def __iter__(self) -> Iterator[Header | Record | Set | MetadataFormat]:
"""Iterate over individual OAI items from the response.
Go through the items in the OAI-PMH response, applying any necessary mapping and handling
Expand All @@ -191,11 +202,16 @@ def __iter__(self) -> Iterator[OAIItem]:
"""
while True:
for item in self._items:
mapped = self.mapper(item)
if self.ignore_deleted and mapped.deleted:
status = None
if isinstance(item, Header):
status = item.status
elif isinstance(item, Record):
status = item.header.status

if self.ignore_deleted and status and status.DELETED:
continue
yield mapped
if self.resumption_token and self.resumption_token.token:
yield item
if self.resumption_token:
self._next_response()
else:
return
Loading

0 comments on commit f1f1ad9

Please sign in to comment.