From f1f1ad9e73ccf96a4281f9848d9c328ea7450b1d Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Thu, 25 Jan 2024 13:44:36 +0100 Subject: [PATCH] feat!: add xsdata models --- .pre-commit-config.yaml | 5 +- pyproject.toml | 5 +- src/oaipmh_scythe/__init__.py | 4 +- src/oaipmh_scythe/client.py | 48 +- src/oaipmh_scythe/iterator.py | 118 ++-- src/oaipmh_scythe/models.py | 280 --------- src/oaipmh_scythe/models/.xsdata.xml | 39 ++ src/oaipmh_scythe/models/__init__.py | 14 + src/oaipmh_scythe/models/oai_dc.py | 261 +++++++++ src/oaipmh_scythe/models/oai_pmh.py | 652 +++++++++++++++++++++ src/oaipmh_scythe/response.py | 85 ++- src/oaipmh_scythe/utils.py | 59 +- tests/integration/test_get_record.py | 9 +- tests/integration/test_identify.py | 2 +- tests/integration/test_list_identifiers.py | 2 +- tests/integration/test_list_records.py | 4 +- tests/integration/test_list_sets.py | 2 +- tests/unit/test_iterator.py | 4 +- tests/unit/test_utils.py | 54 +- 19 files changed, 1134 insertions(+), 513 deletions(-) delete mode 100644 src/oaipmh_scythe/models.py create mode 100644 src/oaipmh_scythe/models/.xsdata.xml create mode 100644 src/oaipmh_scythe/models/__init__.py create mode 100644 src/oaipmh_scythe/models/oai_dc.py create mode 100644 src/oaipmh_scythe/models/oai_pmh.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f4f9d6c..f4c2ff7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -72,8 +72,9 @@ repos: - id: mypy args: [--config-file=pyproject.toml] additional_dependencies: - - httpx==0.26.0 - - lxml-stubs==0.5.1 + - httpx>=0.27.0 + - lxml-stubs>=0.5.1 + - xsdata>=24.4 exclude: tests - repo: https://github.com/scientific-python/cookie diff --git a/pyproject.toml b/pyproject.toml index 0b47ba6..b3010de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dynamic = [ ] dependencies = [ "httpx>=0.25", - "lxml>=5.1", + "xsdata[cli,lxml]", # TODO: remove cli extra ] [project.optional-dependencies] dev = [ @@ -169,6 +169,9 @@ pydocstyle.convention = "google" "src/oaipmh_scythe/client.py" = [ "PLR0913", # too-many-arguments ] +"src/oaipmh_scythe/models/*" = [ + "D106", +] "tests/*" = [ "D100", # undocumented-public-module "D103", # undocumented-public-function diff --git a/src/oaipmh_scythe/__init__.py b/src/oaipmh_scythe/__init__.py index c0135b6..d130b31 100644 --- a/src/oaipmh_scythe/__init__.py +++ b/src/oaipmh_scythe/__init__.py @@ -6,9 +6,9 @@ """oaipmh-scythe: A Scythe for harvesting OAI-PMH repositories.""" from oaipmh_scythe.client import Scythe -from oaipmh_scythe.response import OAIResponse +from oaipmh_scythe.response import Response __all__ = [ "Scythe", - "OAIResponse", + "Response", ] diff --git a/src/oaipmh_scythe/client.py b/src/oaipmh_scythe/client.py index b5856c8..ffbac0b 100644 --- a/src/oaipmh_scythe/client.py +++ b/src/oaipmh_scythe/client.py @@ -21,8 +21,8 @@ from oaipmh_scythe.__about__ import __version__ from oaipmh_scythe.iterator import BaseOAIIterator, OAIItemIterator -from oaipmh_scythe.models import Header, Identify, MetadataFormat, OAIItem, Record, Set -from oaipmh_scythe.response import OAIResponse +from oaipmh_scythe.models import Header, Identify, MetadataFormat, Record, Set, Verb +from oaipmh_scythe.response import Response, _build_response from oaipmh_scythe.utils import filter_dict_except_resumption_token, log_response, remove_none_values if TYPE_CHECKING: @@ -37,17 +37,6 @@ OAI_NAMESPACE: str = "{http://www.openarchives.org/OAI/2.0/}" -# Map OAI verbs to class representations -DEFAULT_CLASS_MAP = { - "GetRecord": Record, - "ListRecords": Record, - "ListIdentifiers": Header, - "ListSets": Set, - "ListMetadataFormats": MetadataFormat, - "Identify": Identify, -} - - class Scythe: """A client for interacting with OAI-PMH interfaces, facilitating the harvesting of records, identifiers, and sets. @@ -82,7 +71,6 @@ def __init__( max_retries: int = 0, retry_status_codes: Iterable[int] | None = None, default_retry_after: int = 60, - class_mapping: dict[str, type[OAIItem]] | None = None, encoding: str = "utf-8", auth: AuthTypes | None = None, timeout: int = 60, @@ -99,7 +87,6 @@ def __init__( self.retry_status_codes = retry_status_codes or (503,) self.default_retry_after = default_retry_after self.oai_namespace = OAI_NAMESPACE - self.class_mapping = class_mapping or DEFAULT_CLASS_MAP self.encoding = encoding self.auth = auth self.timeout = timeout @@ -149,7 +136,7 @@ def __exit__( ) -> None: self.close() - def harvest(self, query: dict[str, str]) -> OAIResponse: + def harvest(self, query: dict[str, str]) -> Response: """Perform an HTTP request to the OAI server with the given parameters. Send an OAI-PMH request to the server using the specified parameters. Handle retry logic @@ -172,7 +159,7 @@ def harvest(self, query: dict[str, str]) -> OAIResponse: time.sleep(retry_after) http_response = self._request(query) http_response.raise_for_status() - return OAIResponse(http_response, params=query) + return _build_response(http_response) def _request(self, query: dict[str, str]) -> httpx.Response: """Send an HTTP request to the OAI server using the configured HTTP method and given query parameters. @@ -195,7 +182,7 @@ def list_records( set_: str | None = None, resumption_token: str | None = None, ignore_deleted: bool = False, - ) -> Iterator[OAIResponse | Record]: + ) -> Iterator[Response | Record]: """Issue a ListRecords request to the OAI server. Send a request to list records from the OAI server, allowing for selective harvesting based on date range, @@ -224,7 +211,7 @@ def list_records( """ _query = { - "verb": "ListRecords", + "verb": Verb.LIST_RECORDS.value, "from": from_, "until": until, "metadataPrefix": metadata_prefix, @@ -242,7 +229,7 @@ def list_identifiers( set_: str | None = None, resumption_token: str | None = None, ignore_deleted: bool = False, - ) -> Iterator[OAIResponse | Header]: + ) -> Iterator[Response | Header]: """Issue a ListIdentifiers request to the OAI server. Send a request to list record identifiers from the OAI server. This method allows filtering records based on @@ -271,7 +258,7 @@ def list_identifiers( """ _query = { - "verb": "ListIdentifiers", + "verb": Verb.LIST_IDENTIFIERS.value, "from": from_, "until": until, "metadataPrefix": metadata_prefix, @@ -282,7 +269,7 @@ def list_identifiers( query = remove_none_values(filter_dict_except_resumption_token(_query)) yield from self.iterator(self, query, ignore_deleted=ignore_deleted) - def list_sets(self, resumption_token: str | None = None) -> Iterator[OAIResponse | Set]: + def list_sets(self, resumption_token: str | None = None) -> Iterator[Response | Set]: """Issue a ListSets request to the OAI server. Send a request to list all sets defined in the OAI server. Sets are used to categorize records in the OAI @@ -303,7 +290,7 @@ def list_sets(self, resumption_token: str | None = None) -> Iterator[OAIResponse """ _query = { - "verb": "ListSets", + "verb": Verb.LIST_SETS.value, "resumptionToken": resumption_token, } query = remove_none_values(filter_dict_except_resumption_token(_query)) @@ -323,10 +310,13 @@ def identify(self) -> Identify: about the OAI server. """ - query = {"verb": "Identify"} - return Identify(self.harvest(query)) + # return Identify(self.harvest(query)) + + query = {"verb": Verb.IDENTIFY.value} + response = self.harvest(query) + return response.parsed.identify - def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> OAIResponse | Record: + def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> Response | Record: """Issue a GetRecord request to the OAI server. Send a request to the OAI server to retrieve a specific record. The request is constructed with the provided @@ -350,13 +340,13 @@ def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> OAIRes """ query = { - "verb": "GetRecord", + "verb": Verb.GET_RECORD.value, "identifier": identifier, "metadataPrefix": metadata_prefix, } return next(iter(self.iterator(self, query))) - def list_metadata_formats(self, identifier: str | None = None) -> Iterator[OAIResponse | MetadataFormat]: + def list_metadata_formats(self, identifier: str | None = None) -> Iterator[Response | MetadataFormat]: """Issue a ListMetadataFormats request to the OAI server. Send a request to list the metadata formats available from the OAI server. This can be done for the entire @@ -380,7 +370,7 @@ def list_metadata_formats(self, identifier: str | None = None) -> Iterator[OAIRe """ _query = { - "verb": "ListMetadataFormats", + "verb": Verb.LIST_METADATA_FORMATS.value, "identifier": identifier, } query = remove_none_values(_query) diff --git a/src/oaipmh_scythe/iterator.py b/src/oaipmh_scythe/iterator.py index 18cc6a5..cf3bc92 100644 --- a/src/oaipmh_scythe/iterator.py +++ b/src/oaipmh_scythe/iterator.py @@ -19,25 +19,50 @@ from __future__ import annotations from abc import ABC, abstractmethod +from enum import StrEnum +from operator import attrgetter from typing import TYPE_CHECKING -from oaipmh_scythe import exceptions -from oaipmh_scythe.models import ResumptionToken +from oaipmh_scythe.models import Header, Record, Verb if TYPE_CHECKING: from collections.abc import Iterator from oaipmh_scythe import Scythe - from oaipmh_scythe.models import OAIItem - from oaipmh_scythe.response import OAIResponse - -VERBS_ELEMENTS: dict[str, str] = { - "GetRecord": "record", - "ListRecords": "record", - "ListIdentifiers": "header", - "ListSets": "set", - "ListMetadataFormats": "metadataFormat", - "Identify": "Identify", + from oaipmh_scythe.models import Header, MetadataFormat, Record, ResumptionToken, Set + from oaipmh_scythe.response import Response + + +class GetRecord(StrEnum): + attribute = "get_record" + element = "record" + + +class ListIdentifiers(StrEnum): + attribute = "list_identifiers" + element = "header" + + +class ListRecords(StrEnum): + attribute = "list_records" + element = "record" + + +class ListSets(StrEnum): + attribute = "list_sets" + element = "set" + + +class ListMetadataFormats(StrEnum): + attribute = "list_metadataformats" + element = "metadataformat" + + +MAPPING = { + Verb.LIST_IDENTIFIERS.value: ListIdentifiers, + Verb.GET_RECORD.value: GetRecord, + Verb.LIST_RECORDS.value: ListRecords, + Verb.LIST_SETS.value: ListSets, } @@ -66,8 +91,8 @@ def __init__(self, scythe: Scythe, query: dict[str, str], ignore_deleted: bool = self.scythe = scythe self.query = query self.ignore_deleted = ignore_deleted - self.verb: str = self.query["verb"] - self.oai_response: OAIResponse | None = None + self.verb = self.query["verb"] + self.response: Response | None = None self.resumption_token: ResumptionToken | None = None self._next_response() @@ -87,17 +112,10 @@ def _get_resumption_token(self) -> ResumptionToken | None: Returns: A ResumptionToken instance if a token is found in the response, otherwise None. """ - ns = self.scythe.oai_namespace - if ( - self.oai_response is not None - and (token_element := self.oai_response.xml.find(f".//{ns}resumptionToken")) is not None - ): - return ResumptionToken( - token=token_element.text, - cursor=token_element.attrib.get("cursor"), # type: ignore [arg-type] - complete_list_size=token_element.attrib.get("completeListSize"), # type: ignore [arg-type] - expiration_date=token_element.attrib.get("expirationDate"), # type: ignore [arg-type] - ) + if self.response is not None: + attribute = MAPPING[self.verb].attribute.value + parsed_data = getattr(self.response.parsed, attribute) + return parsed_data.resumption_token return None def _next_response(self) -> None: @@ -108,18 +126,9 @@ def _next_response(self) -> None: If an error is encountered in the OAI response, an appropriate exception is raised. """ - if self.resumption_token and self.resumption_token.token: - self.query = {"verb": self.verb, "resumptionToken": self.resumption_token.token} - self.oai_response = self.scythe.harvest(self.query) - - if (error := self.oai_response.xml.find(f".//{self.scythe.oai_namespace}error")) is not None: - code = str(error.attrib.get("code", "UNKNOWN")) - description = error.text or "" - try: - exception_name = code[0].upper() + code[1:] - raise getattr(exceptions, exception_name)(description) - except AttributeError as exc: - raise exceptions.GeneralOAIPMHError(description) from exc + if self.resumption_token is not None: + self.query = {"verb": self.verb, "resumptionToken": self.resumption_token.value} + self.response = self.scythe.harvest(self.query) self.resumption_token = self._get_resumption_token() @@ -131,7 +140,7 @@ class OAIResponseIterator(BaseOAIIterator): underlying mechanisms of the BaseOAIIterator, including handling of resumption tokens for paginated data. """ - def __iter__(self) -> Iterator[OAIResponse]: + def __iter__(self) -> Iterator[Response]: """Yield the next OAIResponse object from the server response sequence. Enable the OAIResponseIterator to iterate over a series of OAIResponse objects, managing pagination @@ -141,10 +150,10 @@ def __iter__(self) -> Iterator[OAIResponse]: OAIResponse: The next available OAIResponse object in the sequence. """ while True: - if self.oai_response: - yield self.oai_response - self.oai_response = None - elif self.resumption_token and self.resumption_token.token: + if self.response: + yield self.response + self.response = None + elif self.resumption_token: self._next_response() else: return @@ -164,8 +173,10 @@ class OAIItemIterator(BaseOAIIterator): def __init__(self, scythe: Scythe, query: dict[str, str], ignore_deleted: bool = False) -> None: self.verb = query["verb"] - self.mapper = scythe.class_mapping[self.verb] - self.element = VERBS_ELEMENTS[self.verb] + attribute = MAPPING[query["verb"]].attribute.value + element = MAPPING[query["verb"]].element.value + self.items_getter = attrgetter(f"{attribute}.{element}") + print(self.items_getter) super().__init__(scythe, query, ignore_deleted) def _next_response(self) -> None: @@ -175,12 +186,12 @@ def _next_response(self) -> None: for the specific elements (e.g. records, headers) based on the current resumption token. """ super()._next_response() - if self.oai_response is not None: - self._items = self.oai_response.xml.iterfind(f".//{self.scythe.oai_namespace}{self.element}") + if self.response is not None: + self._items = self.items_getter(self.response.parsed) else: self._items = iter(()) - def __iter__(self) -> Iterator[OAIItem]: + def __iter__(self) -> Iterator[Header | Record | Set | MetadataFormat]: """Iterate over individual OAI items from the response. Go through the items in the OAI-PMH response, applying any necessary mapping and handling @@ -191,11 +202,16 @@ def __iter__(self) -> Iterator[OAIItem]: """ while True: for item in self._items: - mapped = self.mapper(item) - if self.ignore_deleted and mapped.deleted: + status = None + if isinstance(item, Header): + status = item.status + elif isinstance(item, Record): + status = item.header.status + + if self.ignore_deleted and status and status.DELETED: continue - yield mapped - if self.resumption_token and self.resumption_token.token: + yield item + if self.resumption_token: self._next_response() else: return diff --git a/src/oaipmh_scythe/models.py b/src/oaipmh_scythe/models.py deleted file mode 100644 index f2464d9..0000000 --- a/src/oaipmh_scythe/models.py +++ /dev/null @@ -1,280 +0,0 @@ -# SPDX-FileCopyrightText: 2015 Mathias Loesch -# SPDX-FileCopyrightText: 2023 Heinz-Alexander Fütterer -# -# SPDX-License-Identifier: BSD-3-Clause - -"""The models module defines data structures for representing various components of the OAI-PMH protocol. - -This module includes classes that encapsulate different entities in OAI-PMH, such as resumption tokens and -various types of OAI items. These classes provide structured representations of OAI-PMH elements, -facilitating their manipulation and processing in client applications. - -Classes: - ResumptionToken: Represents a resumption token used in OAI-PMH for paginated data retrieval. - OAIItem: A base class for generic OAI items. - Identify: Represents an Identify response in OAI-PMH. - Header: Represents an OAI Header element. - Record: Represents an OAI Record element. - Set: Represents an OAI Set element. - MetadataFormat: Represents an OAI MetadataFormat element. -""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING - -from lxml import etree - -from oaipmh_scythe.utils import get_namespace, xml_to_dict - -if TYPE_CHECKING: - from collections.abc import Iterator - - from oaipmh_scythe.response import OAIResponse - - -@dataclass -class ResumptionToken: - """A data class representing a resumption token in the OAI-PMH protocol. - - Resumption tokens are used for iterating over multiple sets of results in OAI-PMH - harvest requests. This class encapsulates the typical components of a resumption token, - including the token itself, cursor, complete list size, and an expiration date. - - Attributes: - token: The actual resumption token used for continuing the iteration in subsequent OAI-PMH requests. - Default is None. - cursor: A marker indicating the current position in the list of results. Default is None. - complete_list_size: The total number of records in the complete list of results. Default is None. - expiration_date: The date and time when the resumption token expires. Default is None. - """ - - token: str | None = None - cursor: str | None = None - complete_list_size: str | None = None - expiration_date: str | None = None - - def __repr__(self) -> str: - return f"" - - -class OAIItem: - """A base class representing a generic item in the OAI-PMH protocol. - - This class provides a common structure for handling and manipulating XML data - associated with different types of OAI-PMH items, such as records, headers, or sets. - - Attributes: - xml: The parsed XML element representing the OAI item. - _strip_ns: A flag indicating whether to remove the namespaces from the element names - in the dictionary representation. - _oai_namespace: The namespace URI extracted from the XML element. - """ - - def __init__(self, xml: etree._Element, strip_ns: bool = True) -> None: - super().__init__() - self.xml = xml - self._strip_ns = strip_ns - self._oai_namespace = get_namespace(self.xml) - - def __bytes__(self) -> bytes: - return etree.tostring(self.xml, encoding="utf-8") - - def __str__(self) -> str: - return etree.tostring(self.xml, encoding="unicode") - - @property - def raw(self) -> str: - """Return the original XML as a unicode string.""" - return etree.tostring(self.xml, encoding="unicode") - - -class Identify(OAIItem): - """A class representing an Identify container in the OAI-PMH protocol. - - This class is specifically used for handling the response of an Identify request in OAI-PMH. - It differs from other OAI entities in that it is initialized with an OAIResponse object - rather than a direct XML element. The class parses the Identify information from the - response and provides access to its individual components. - - Args: - identify_response: The response object from an Identify request. - It should contain the XML representation of the Identify response. - - Attributes: - xml: The XML element representing the Identify response. - _identify_dict: A dictionary containing the parsed Identify information. - Dynamic Attributes: Based on the content of the Identify response, additional attributes - are dynamically set on this object. These can include attributes like - repository name, base URL, protocol version, etc. - - Raises: - ValueError: If the Identify element is not found in the provided XML. - """ - - def __init__(self, identify_response: OAIResponse) -> None: - super().__init__(identify_response.xml, strip_ns=True) - identify_element = self.xml.find(f".//{self._oai_namespace}Identify") - if identify_element is None: - raise ValueError("Identify element not found in the XML.") - self.xml = identify_element - self._identify_dict = xml_to_dict(self.xml, strip_ns=True) - for k, v in self._identify_dict.items(): - setattr(self, k.replace("-", "_"), v[0]) - - def __repr__(self) -> str: - return "" - - def __iter__(self) -> Iterator: - """Iterate over the Identify information, yielding key-value pairs.""" - return iter(self._identify_dict.items()) - - -class Header(OAIItem): - """A class representing an OAI Header in the OAI-PMH protocol. - - The header contains essential information about a record, such as its identifier, datestamp, - and set specifications. This class parses these details from the provided XML header element - and makes them easily accessible as attributes. - - Args: - header_element: The XML element representing the OAI header. - - Attributes: - deleted: Indicates whether the record is marked as deleted in the OAI-PMH repository. - identifier: The unique identifier of the record in the OAI-PMH repository. - datestamp: The datestamp of the record, indicating when it was last updated. - setSpecs: A list of set specifications that the record belongs to. - """ - - def __init__(self, header_element: etree._Element) -> None: - super().__init__(header_element, strip_ns=True) - self.deleted = self.xml.attrib.get("status") == "deleted" - _identifier_element = self.xml.find(f"{self._oai_namespace}identifier") - _datestamp_element = self.xml.find(f"{self._oai_namespace}datestamp") - - self.identifier = getattr(_identifier_element, "text", None) - self.datestamp = getattr(_datestamp_element, "text", None) - self.setSpecs = [setSpec.text for setSpec in self.xml.findall(f"{self._oai_namespace}setSpec")] - - def __repr__(self) -> str: - return f"
" - - def __iter__(self) -> Iterator: - """Iterate over the header information, yielding key-value pairs.""" - return iter( - [ - ("identifier", self.identifier), - ("datestamp", self.datestamp), - ("setSpecs", self.setSpecs), - ] - ) - - -class Record(OAIItem): - """A class representing an OAI record in the OAI-PMH protocol. - - This class encapsulates a record element from an OAI-PMH response, handling its parsing, and providing - structured access to its details, such as header information and metadata. It checks for the presence of - the header and metadata elements and raises an error if the header is not found. - - Args: - record_element: The XML element representing the OAI record. - strip_ns: If True, namespaces are removed from the element names in the parsed metadata. Defaults to True. - - Attributes: - header: An instance of the Header class representing the header information of the record. - deleted: Indicates whether the record is marked as deleted. - metadata: A dictionary representation of the record's metadata, if available and not deleted. - - Raises: - ValueError: If the header element is not found in the provided XML. - """ - - def __init__(self, record_element: etree._Element, strip_ns: bool = True) -> None: - super().__init__(record_element, strip_ns=strip_ns) - header_element = self.xml.find(f".//{self._oai_namespace}header") - if header_element is None: - raise ValueError("Header element not found in the XML.") - self.header = Header(header_element) - self.deleted = self.header.deleted - if not self.deleted: - self.metadata = self.get_metadata() - - def __repr__(self) -> str: - return f"" - - def __iter__(self) -> Iterator: - """Iterate over the record's metadata, yielding key-value pairs.""" - return iter(self.metadata.items()) - - def get_metadata(self): - """Extract and return the record's metadata as a dictionary.""" - # We want to get record/metadata//* - # would be the element ``dc`` - # in the ``oai_dc`` case. - return xml_to_dict( - self.xml.find(".//" + self._oai_namespace + "metadata").getchildren()[0], - strip_ns=self._strip_ns, - ) - - -class Set(OAIItem): - """A class representing a set in the OAI-PMH protocol. - - This class encapsulates a set element from an OAI-PMH response and provides structured access to its details. - It parses the set information from the provided XML element and dynamically sets attributes - based on the parsed content. - - Args: - set_element: The XML element representing the OAI set. The element is parsed to extract set details. - - Attributes: - setName: The name of the set, extracted from the set's XML element. - _set_dict: A dictionary containing the parsed set information. - """ - - def __init__(self, set_element: etree._Element) -> None: - super().__init__(set_element, strip_ns=True) - self._set_dict = xml_to_dict(self.xml, strip_ns=True) - self.setName: str | None = None - for k, v in self._set_dict.items(): - setattr(self, k.replace("-", "_"), v[0]) - - def __repr__(self) -> str: - return f"" - - def __iter__(self) -> Iterator: - """Iterate over the set information, yielding key-value pairs.""" - return iter(self._set_dict.items()) - - -class MetadataFormat(OAIItem): - """A class representing a metadata format in the OAI-PMH protocol. - - This class handles the representation of a metadata format, which is an essential part of the OAI-PMH protocol. - It parses the provided XML element to extract and store metadata format details such as the metadata prefix. - - Args: - mdf_element: The XML element representing the metadata format. This element is parsed - to extract metadata format details. - - Attributes: - metadataPrefix: The prefix of the metadata format, extracted from the XML element. - _mdf_dict: A dictionary containing the parsed metadata format details. - """ - - def __init__(self, mdf_element: etree._Element) -> None: - super().__init__(mdf_element, strip_ns=True) - self._mdf_dict = xml_to_dict(self.xml, strip_ns=True) - self.metadataPrefix: str | None = None - for k, v in self._mdf_dict.items(): - setattr(self, k.replace("-", "_"), v[0]) - - def __repr__(self) -> str: - return f"" - - def __iter__(self) -> Iterator: - """Iterate over the metadata format information, yielding key-value pairs.""" - return iter(self._mdf_dict.items()) diff --git a/src/oaipmh_scythe/models/.xsdata.xml b/src/oaipmh_scythe/models/.xsdata.xml new file mode 100644 index 0000000..7fd5df4 --- /dev/null +++ b/src/oaipmh_scythe/models/.xsdata.xml @@ -0,0 +1,39 @@ + + + + generated + dataclasses + single-package + Google + false + false + false + true + false + false + true + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/oaipmh_scythe/models/__init__.py b/src/oaipmh_scythe/models/__init__.py new file mode 100644 index 0000000..52e8445 --- /dev/null +++ b/src/oaipmh_scythe/models/__init__.py @@ -0,0 +1,14 @@ +from oaipmh_scythe.models.oai_pmh import Header, Identify, MetadataFormat, Record, ResumptionToken, Set, Verb + +Item = Header | Record | Set | MetadataFormat + +__all__ = [ + "Header", + "Identify", + "MetadataFormat", + "Record", + "ResumptionToken", + "Set", + "Verb", + "Item", +] diff --git a/src/oaipmh_scythe/models/oai_dc.py b/src/oaipmh_scythe/models/oai_dc.py new file mode 100644 index 0000000..7f137dd --- /dev/null +++ b/src/oaipmh_scythe/models/oai_dc.py @@ -0,0 +1,261 @@ +"""This file was generated by xsdata, v24.4, on 2024-04-04 16:40:52 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum + + +class LangValue(Enum): + VALUE = "" + + +@dataclass(slots=True) +class ElementType: + class Meta: + name = "elementType" + target_namespace = "http://purl.org/dc/elements/1.1/" + + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + + +@dataclass(slots=True) +class Contributor(ElementType): + class Meta: + name = "contributor" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Coverage(ElementType): + class Meta: + name = "coverage" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Creator(ElementType): + class Meta: + name = "creator" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Date(ElementType): + class Meta: + name = "date" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Description(ElementType): + class Meta: + name = "description" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Format(ElementType): + class Meta: + name = "format" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Identifier(ElementType): + class Meta: + name = "identifier" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Language(ElementType): + class Meta: + name = "language" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Publisher(ElementType): + class Meta: + name = "publisher" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Relation(ElementType): + class Meta: + name = "relation" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Rights(ElementType): + class Meta: + name = "rights" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Source(ElementType): + class Meta: + name = "source" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Subject(ElementType): + class Meta: + name = "subject" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Title(ElementType): + class Meta: + name = "title" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class TypeType(ElementType): + class Meta: + name = "type" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class OaiDcType: + class Meta: + name = "oai_dcType" + target_namespace = "http://www.openarchives.org/OAI/2.0/oai_dc/" + + title: list[Title] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + creator: list[Creator] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + subject: list[Subject] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + description: list[Description] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + publisher: list[Publisher] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + contributor: list[Contributor] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + date: list[Date] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + type_value: list[TypeType] = field( + default_factory=list, + metadata={ + "name": "type", + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + format: list[Format] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + identifier: list[Identifier] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + source: list[Source] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + language: list[Language] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + relation: list[Relation] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + coverage: list[Coverage] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + rights: list[Rights] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + + +@dataclass(slots=True) +class Dc(OaiDcType): + class Meta: + name = "dc" + namespace = "http://www.openarchives.org/OAI/2.0/oai_dc/" diff --git a/src/oaipmh_scythe/models/oai_pmh.py b/src/oaipmh_scythe/models/oai_pmh.py new file mode 100644 index 0000000..2077709 --- /dev/null +++ b/src/oaipmh_scythe/models/oai_pmh.py @@ -0,0 +1,652 @@ +"""This file was generated by xsdata, v24.4, on 2024-04-04 16:47:04 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum + +from xsdata.models.datatype import XmlDate, XmlDateTime + +__NAMESPACE__ = "http://www.openarchives.org/OAI/2.0/" + + +class OaiPmherrorcode(Enum): + CANNOT_DISSEMINATE_FORMAT = "cannotDisseminateFormat" + ID_DOES_NOT_EXIST = "idDoesNotExist" + BAD_ARGUMENT = "badArgument" + BAD_VERB = "badVerb" + NO_METADATA_FORMATS = "noMetadataFormats" + NO_RECORDS_MATCH = "noRecordsMatch" + BAD_RESUMPTION_TOKEN = "badResumptionToken" + NO_SET_HIERARCHY = "noSetHierarchy" + + +@dataclass(slots=True) +class About: + """Data "about" the record must be expressed in XML that is compliant with an XML + Schema defined by a community. + """ + + class Meta: + name = "aboutType" + + other_element: None | object = field( + default=None, + metadata={ + "type": "Wildcard", + "namespace": "##other", + }, + ) + + +class DeletedRecord(Enum): + NO = "no" + PERSISTENT = "persistent" + TRANSIENT = "transient" + + +@dataclass(slots=True) +class Description: + """The descriptionType is used for the description element in Identify and for + setDescription element in ListSets. + + Content must be compliant with an XML Schema defined by a community. + """ + + class Meta: + name = "descriptionType" + + other_element: None | object = field( + default=None, + metadata={ + "type": "Wildcard", + "namespace": "##other", + }, + ) + + +class Granularity(Enum): + YYYY_MM_DD = "YYYY-MM-DD" + YYYY_MM_DDTHH_MM_SS_Z = "YYYY-MM-DDThh:mm:ssZ" + + +@dataclass(slots=True) +class MetadataFormat: + class Meta: + name = "metadataFormatType" + + metadata_prefix: None | str = field( + default=None, + metadata={ + "name": "metadataPrefix", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + "pattern": r"[A-Za-z0-9\-_\.!~\*'\(\)]+", + }, + ) + schema: None | str = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + metadata_namespace: None | str = field( + default=None, + metadata={ + "name": "metadataNamespace", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + + +@dataclass(slots=True) +class Metadata: + """Metadata must be expressed in XML that complies with another XML Schema + (namespace=#other). + + Metadata must be explicitly qualified in the response. + """ + + class Meta: + name = "metadataType" + + other_element: None | object = field( + default=None, + metadata={ + "type": "Wildcard", + "namespace": "##other", + }, + ) + + +class ProtocolVersion(Enum): + VALUE_2_0 = "2.0" + + +@dataclass(slots=True) +class ResumptionToken: + """A resumptionToken may have 3 optional attributes and can be used in ListSets, + ListIdentifiers, ListRecords responses. + """ + + class Meta: + name = "resumptionTokenType" + + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + expiration_date: None | XmlDateTime = field( + default=None, + metadata={ + "name": "expirationDate", + "type": "Attribute", + }, + ) + complete_list_size: None | int = field( + default=None, + metadata={ + "name": "completeListSize", + "type": "Attribute", + }, + ) + cursor: None | int = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + + +class Status(Enum): + DELETED = "deleted" + + +class Verb(Enum): + IDENTIFY = "Identify" + LIST_METADATA_FORMATS = "ListMetadataFormats" + LIST_SETS = "ListSets" + GET_RECORD = "GetRecord" + LIST_IDENTIFIERS = "ListIdentifiers" + LIST_RECORDS = "ListRecords" + + +@dataclass(slots=True) +class Identify: + class Meta: + name = "IdentifyType" + + repository_name: None | str = field( + default=None, + metadata={ + "name": "repositoryName", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + base_url: None | str = field( + default=None, + metadata={ + "name": "baseURL", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + protocol_version: None | ProtocolVersion = field( + default=None, + metadata={ + "name": "protocolVersion", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + admin_email: list[str] = field( + default_factory=list, + metadata={ + "name": "adminEmail", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "min_occurs": 1, + "pattern": r"\S+@(\S+\.)+\S+", + }, + ) + earliest_datestamp: None | XmlDate | str = field( + default=None, + metadata={ + "name": "earliestDatestamp", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + "pattern": r".*Z", + }, + ) + deleted_record: None | DeletedRecord = field( + default=None, + metadata={ + "name": "deletedRecord", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + granularity: None | Granularity = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + compression: list[str] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + description: list[Description] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + + +@dataclass(slots=True) +class ListMetadataFormats: + class Meta: + name = "ListMetadataFormatsType" + + metadata_format: list[MetadataFormat] = field( + default_factory=list, + metadata={ + "name": "metadataFormat", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "min_occurs": 1, + }, + ) + + +@dataclass(slots=True) +class OaiPmherror: + class Meta: + name = "OAI-PMHerrorType" + + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + code: None | OaiPmherrorcode = field( + default=None, + metadata={ + "type": "Attribute", + "required": True, + }, + ) + + +@dataclass(slots=True) +class Header: + """A header has a unique identifier, a datestamp, and setSpec(s) in case the + item from which the record is disseminated belongs to set(s). + + the header can carry a deleted status indicating that the record is deleted. + """ + + class Meta: + name = "headerType" + + identifier: None | str = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + datestamp: None | XmlDate | str = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + "pattern": r".*Z", + }, + ) + set_spec: list[str] = field( + default_factory=list, + metadata={ + "name": "setSpec", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "pattern": r"([A-Za-z0-9\-_\.!~\*'\(\)])+(:[A-Za-z0-9\-_\.!~\*'\(\)]+)*", + }, + ) + status: None | Status = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + + +@dataclass(slots=True) +class Request: + """Define requestType, indicating the protocol request that led to the + response. + + Element content is BASE-URL, attributes are arguments of protocol request, attribute-values are values of + arguments of protocol request + """ + + class Meta: + name = "requestType" + + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + verb: None | Verb = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + identifier: None | str = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + metadata_prefix: None | str = field( + default=None, + metadata={ + "name": "metadataPrefix", + "type": "Attribute", + "pattern": r"[A-Za-z0-9\-_\.!~\*'\(\)]+", + }, + ) + from_value: None | XmlDate | str = field( + default=None, + metadata={ + "name": "from", + "type": "Attribute", + "pattern": r".*Z", + }, + ) + until: None | XmlDate | str = field( + default=None, + metadata={ + "type": "Attribute", + "pattern": r".*Z", + }, + ) + set: None | str = field( + default=None, + metadata={ + "type": "Attribute", + "pattern": r"([A-Za-z0-9\-_\.!~\*'\(\)])+(:[A-Za-z0-9\-_\.!~\*'\(\)]+)*", + }, + ) + resumption_token: None | str = field( + default=None, + metadata={ + "name": "resumptionToken", + "type": "Attribute", + }, + ) + + +@dataclass(slots=True) +class Set: + class Meta: + name = "setType" + + set_spec: None | str = field( + default=None, + metadata={ + "name": "setSpec", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + "pattern": r"([A-Za-z0-9\-_\.!~\*'\(\)])+(:[A-Za-z0-9\-_\.!~\*'\(\)]+)*", + }, + ) + set_name: None | str = field( + default=None, + metadata={ + "name": "setName", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + set_description: list[Description] = field( + default_factory=list, + metadata={ + "name": "setDescription", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + + +@dataclass(slots=True) +class ListIdentifiers: + class Meta: + name = "ListIdentifiersType" + + header: list[Header] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "min_occurs": 1, + }, + ) + resumption_token: None | ResumptionToken = field( + default=None, + metadata={ + "name": "resumptionToken", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + + +@dataclass(slots=True) +class ListSets: + class Meta: + name = "ListSetsType" + + set: list[Set] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "min_occurs": 1, + }, + ) + resumption_token: None | ResumptionToken = field( + default=None, + metadata={ + "name": "resumptionToken", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + + +@dataclass(slots=True) +class Record: + """A record has a header, a metadata part, and an optional about container.""" + + class Meta: + name = "recordType" + + header: None | Header = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + metadata: None | Metadata = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + about: list[About] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + + +@dataclass(slots=True) +class GetRecord: + class Meta: + name = "GetRecordType" + + record: None | Record = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + + +@dataclass(slots=True) +class ListRecords: + class Meta: + name = "ListRecordsType" + + record: list[Record] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "min_occurs": 1, + }, + ) + resumption_token: None | ResumptionToken = field( + default=None, + metadata={ + "name": "resumptionToken", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + + +@dataclass(slots=True) +class OaiPmhtype: + class Meta: + name = "OAI-PMHtype" + + response_date: None | XmlDateTime = field( + default=None, + metadata={ + "name": "responseDate", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + request: None | Request = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + error: list[OaiPmherror] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + identify: None | Identify = field( + default=None, + metadata={ + "name": "Identify", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + list_metadata_formats: None | ListMetadataFormats = field( + default=None, + metadata={ + "name": "ListMetadataFormats", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + list_sets: None | ListSets = field( + default=None, + metadata={ + "name": "ListSets", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + get_record: None | GetRecord = field( + default=None, + metadata={ + "name": "GetRecord", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + list_identifiers: None | ListIdentifiers = field( + default=None, + metadata={ + "name": "ListIdentifiers", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + list_records: None | ListRecords = field( + default=None, + metadata={ + "name": "ListRecords", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + + +@dataclass(slots=True) +class OaiPmh(OaiPmhtype): + class Meta: + name = "OAI-PMH" + namespace = "http://www.openarchives.org/OAI/2.0/" diff --git a/src/oaipmh_scythe/response.py b/src/oaipmh_scythe/response.py index 0d49f41..b58ae29 100644 --- a/src/oaipmh_scythe/response.py +++ b/src/oaipmh_scythe/response.py @@ -15,40 +15,75 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -from lxml import etree +import httpx +from xsdata.formats.dataclass.parsers import XmlParser + +from oaipmh_scythe import exceptions +from oaipmh_scythe.models.oai_pmh import OaiPmh if TYPE_CHECKING: - from httpx import Response + from oaipmh_scythe.models.oai_pmh import OaiPmherror -XMLParser = etree.XMLParser(remove_blank_text=True, recover=True, resolve_entities=False) +# Ref: https://github.com/openapi-generators/openapi-python-client/blob/main/end_to_end_tests/golden-record/my_test_api_client/types.py +# Ref: https://github.com/openapi-generators/openapi-python-client/blob/main/end_to_end_tests/golden-record/my_test_api_client/api/responses/text_response.py -@dataclass -class OAIResponse: - """Represents a response received from an OAI server, encapsulating the raw HTTP response and parsed XML content. +parser = XmlParser() + + +def _build_response(http_response: httpx.Response) -> Response: + parsed = _parse_response(http_response.text) + http_response.raise_for_status() + response = Response( + status_code=httpx.codes(http_response.status_code), + content=http_response.content, + headers=http_response.headers, + parsed=parsed, + ) + return response + + +def _parse_response(text: str) -> OaiPmh: + """Parse the response text into an `OaiPmh` object. - This class provides a structured way to access various aspects of an OAI server's response. - It offers methods to retrieve the raw text of the response, parse it as XML, - and obtain a string representation of the response that includes the OAI verb. + Args: + text (str): The response text to parse. - Attributes: - http_response: The original HTTP response object from the OAI server. - params: A dictionary of the OAI parameters used in the request that led to this response. + Returns: + OaiPmh: The parsed `OaiPmh` object. + + Raises: + ValueError: If there is an error parsing the response text, a `ValueError` will be raised with an informative error message. """ + parsed = parser.from_string(text, OaiPmh) + raise_for_error(parsed.error) + return parsed + - http_response: Response - params: dict[str, str] +def raise_for_error(errors: list[OaiPmherror] | None) -> None: + """Raise an exception based on the parsed error data. - @property - def raw(self) -> str: - """Return the raw text of the server's response as a unicode string.""" - return self.http_response.text + Args: + errors: The parsed oaipmh error data. - @property - def xml(self) -> etree._Element: - """Parse the server's response content and return it as an `etree._Element` object.""" - return etree.XML(self.http_response.content, parser=XMLParser) + Returns: + None + """ + if not errors: + return + for error in errors: + code = error.code.value + description = error.value + try: + exception_name = code[0].upper() + code[1:] + raise getattr(exceptions, exception_name)(description) + except AttributeError as exc: + raise exceptions.GeneralOAIPMHError(description) from exc - def __str__(self) -> str: - verb = self.params.get("verb") - return f"" + +@dataclass +class Response: + status_code: httpx.codes + headers: httpx.Headers + content: bytes + parsed: OaiPmh diff --git a/src/oaipmh_scythe/utils.py b/src/oaipmh_scythe/utils.py index e07a157..af1ff9a 100644 --- a/src/oaipmh_scythe/utils.py +++ b/src/oaipmh_scythe/utils.py @@ -13,22 +13,18 @@ log_response: Log the details of an HTTP response. remove_none_values: Remove keys from the dictionary where the value is `None`. filter_dict_except_resumption_token: Filter keys from the dictionary, if resumption token is not `None`. - get_namespace: Extracts the namespace from an XML element. - xml_to_dict: Converts an XML tree or element into a dictionary representation. """ from __future__ import annotations import logging -import re -from collections import defaultdict from typing import TYPE_CHECKING if TYPE_CHECKING: from typing import Any import httpx - from lxml import etree + logger = logging.getLogger(__name__) @@ -84,56 +80,3 @@ def filter_dict_except_resumption_token(d: dict[str, Any | None]) -> dict[str, A ) return {k: v for k, v in d.items() if k in allowed_keys} return d - - -def get_namespace(element: etree._Element) -> str | None: - """Return the namespace URI of an XML element. - - Extracts and returns the namespace URI from the tag of the given XML element. - The namespace URI is enclosed in curly braces at the start of the tag. - If the element does not have a namespace, `None` is returned. - - Args: - element: The XML element from which to extract the namespace. - - Returns: - The namespace URI as a string if the element has a namespace, otherwise `None`. - """ - match = re.search(r"(\{.*\})", element.tag) - return match.group(1) if match else None - - -def xml_to_dict( - tree: etree._Element, paths: list[str] | None = None, nsmap: dict[str, str] | None = None, strip_ns: bool = False -) -> dict[str, list[str | None]]: - """Convert an XML tree to a dictionary, with options for custom XPath and namespace handling. - - This function takes an XML element tree and converts it into a dictionary. The keys of the - dictionary are the tags of the XML elements, and the values are lists of the text contents - of these elements. It offers options to apply specific XPath expressions, handle namespaces, - and optionally strip namespaces from the tags in the resulting dictionary. - - Args: - tree: The root element of the XML tree to be converted. - paths: An optional list of XPath expressions to apply on the XML tree. If None or not - provided, the function will consider all elements in the tree. - nsmap: An optional dictionary for namespace mapping, used to provide shorter, more - readable paths in XPath expressions. If None or not provided, no namespace - mapping is applied. - strip_ns: A boolean flag indicating whether to remove namespaces from the element tags - in the resulting dictionary. Defaults to False. - - Returns: - A dictionary where each key is an element tag (with or without namespace, based on - `strip_ns`) and each value is a list of strings representing the text content of - each element with that tag. - """ - paths = paths or [".//"] - nsmap = nsmap or {} - fields = defaultdict(list) - for path in paths: - elements = tree.findall(path, nsmap) - for element in elements: - tag = re.sub(r"\{.*\}", "", element.tag) if strip_ns else element.tag - fields[tag].append(element.text) - return dict(fields) diff --git a/tests/integration/test_get_record.py b/tests/integration/test_get_record.py index f5163f0..04d0f57 100644 --- a/tests/integration/test_get_record.py +++ b/tests/integration/test_get_record.py @@ -7,8 +7,9 @@ from typing import TYPE_CHECKING import pytest -from httpx import HTTPStatusError + +from oaipmh_scythe.exceptions import BadArgument, IdDoesNotExist from oaipmh_scythe.models import Record if TYPE_CHECKING: @@ -45,14 +46,12 @@ def test_get_record_with_valid_metadata_prefix(scythe: Scythe) -> None: @pytest.mark.default_cassette("get_record.yaml") @pytest.mark.vcr() def test_get_record_with_invalid_metadata_prefix(scythe: Scythe) -> None: - with pytest.raises(HTTPStatusError): - # cannotDisseminateFormat + with pytest.raises(BadArgument): scythe.get_record(identifier=IDENTIFIER, metadata_prefix="XXX") @pytest.mark.default_cassette("id_does_not_exist.yaml") @pytest.mark.vcr() def test_get_record_with_invalid_identifier(scythe: Scythe) -> None: - # idDoesNotExist - with pytest.raises(HTTPStatusError): + with pytest.raises(IdDoesNotExist): scythe.get_record(identifier="oai:zenodo.org:XXX", metadata_prefix="oai_dc") diff --git a/tests/integration/test_identify.py b/tests/integration/test_identify.py index b145b69..ae0b402 100644 --- a/tests/integration/test_identify.py +++ b/tests/integration/test_identify.py @@ -30,7 +30,7 @@ def test_context_manager() -> None: def test_identify(scythe: Scythe) -> None: identify = scythe.identify() assert isinstance(identify, Identify) - assert identify.repositoryName == "Zenodo" + assert identify.repository_name == "Zenodo" @pytest.mark.default_cassette("identify.yaml") diff --git a/tests/integration/test_list_identifiers.py b/tests/integration/test_list_identifiers.py index 9910ee3..b827859 100644 --- a/tests/integration/test_list_identifiers.py +++ b/tests/integration/test_list_identifiers.py @@ -10,7 +10,7 @@ import pytest from lxml import etree -from oaipmh_scythe import OAIResponse, Scythe +from oaipmh_scythe import Response, Scythe from oaipmh_scythe.iterator import OAIResponseIterator from oaipmh_scythe.models import Header diff --git a/tests/integration/test_list_records.py b/tests/integration/test_list_records.py index bd4e760..ba4e3b9 100644 --- a/tests/integration/test_list_records.py +++ b/tests/integration/test_list_records.py @@ -13,7 +13,7 @@ from oaipmh_scythe.iterator import OAIResponseIterator from oaipmh_scythe.models import Record -from oaipmh_scythe.response import OAIResponse +from oaipmh_scythe.response import Response if TYPE_CHECKING: from oaipmh_scythe import Scythe @@ -152,7 +152,7 @@ def test_list_records_oai_response(scythe: Scythe) -> None: # there are 3 canned responses in list_records.yaml assert len(responses) == 3 response = responses[0] - assert isinstance(response, OAIResponse) + assert isinstance(response, Response) assert response.params == {"metadataPrefix": "oai_dc", "verb": "ListRecords"} assert isinstance(response.xml, etree._Element) assert response.xml.tag == "{http://www.openarchives.org/OAI/2.0/}OAI-PMH" diff --git a/tests/integration/test_list_sets.py b/tests/integration/test_list_sets.py index f92cece..a47dc4d 100644 --- a/tests/integration/test_list_sets.py +++ b/tests/integration/test_list_sets.py @@ -26,7 +26,7 @@ def test_list_sets(scythe: Scythe) -> None: assert len(sets) == 10 s = sets[0] assert isinstance(s, Set) - assert s.setName == "European Middleware Initiative" + assert s.set_name == "European Middleware Initiative" @pytest.mark.default_cassette("list_sets.yaml") diff --git a/tests/unit/test_iterator.py b/tests/unit/test_iterator.py index de4374e..f905461 100644 --- a/tests/unit/test_iterator.py +++ b/tests/unit/test_iterator.py @@ -6,7 +6,7 @@ import pytest -from oaipmh_scythe import OAIResponse, Scythe +from oaipmh_scythe import Response, Scythe from oaipmh_scythe.iterator import OAIItemIterator, OAIResponseIterator from oaipmh_scythe.models import Header @@ -25,7 +25,7 @@ def test_iterator_str(scythe: Scythe) -> None: def test_oai_response_iterator(scythe: Scythe) -> None: iterator = OAIResponseIterator(scythe, query) responses = list(iterator) - assert isinstance(responses[0], OAIResponse) + assert isinstance(responses[0], Response) # there are 3 canned responses in list_identifiers.yaml assert len(responses) == 3 diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index b362424..c5aeea5 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -3,22 +3,7 @@ # # SPDX-License-Identifier: BSD-3-Clause -import pytest -from lxml import etree - -from oaipmh_scythe.utils import filter_dict_except_resumption_token, get_namespace, remove_none_values, xml_to_dict - - -@pytest.fixture() -def xml_element_with_namespace() -> etree._Element: - xml = 'https://zenodo.org/oai2d' - return etree.fromstring(xml) - - -@pytest.fixture() -def xml_element_without_namespace() -> etree._Element: - xml = 'https://zenodo.org/oai2d' - return etree.fromstring(xml) +from oaipmh_scythe.utils import filter_dict_except_resumption_token, remove_none_values def test_remove_none_values() -> None: @@ -44,40 +29,3 @@ def test_filter_dict_except_resumption_token_noop() -> None: d = {"resumptionToken": None, "verb": "ListRecords"} result = filter_dict_except_resumption_token(d) assert result == d - - -def test_get_namespace(xml_element_with_namespace: etree._Element) -> None: - namespace = get_namespace(xml_element_with_namespace) - assert namespace == "{http://www.openarchives.org/OAI/2.0/}" - - -def test_get_namespace_without_namespace(xml_element_without_namespace: etree._Element) -> None: - namespace = get_namespace(xml_element_without_namespace) - assert namespace is None - - -def test_xml_to_dict_default(xml_element_with_namespace: etree._Element) -> None: - result = xml_to_dict(xml_element_with_namespace) - expected = {"{http://www.openarchives.org/OAI/2.0/}request": ["https://zenodo.org/oai2d"]} - assert result == expected - - -def test_xml_to_dict_with_paths(xml_element_with_namespace: etree._Element) -> None: - result = xml_to_dict(xml_element_with_namespace, paths=["./{http://www.openarchives.org/OAI/2.0/}request"]) - expected = { - "{http://www.openarchives.org/OAI/2.0/}request": ["https://zenodo.org/oai2d"], - } - assert result == expected - - -def test_xml_to_dict_with_nsmap(xml_element_with_namespace: etree._Element) -> None: - nsmap = {"oai": "http://www.openarchives.org/OAI/2.0/"} - result = xml_to_dict(xml_element_with_namespace, paths=["oai:request"], nsmap=nsmap) - expected = {"{http://www.openarchives.org/OAI/2.0/}request": ["https://zenodo.org/oai2d"]} - assert result == expected - - -def test_xml_to_dict_strip_namespace(xml_element_with_namespace: etree._Element) -> None: - result = xml_to_dict(xml_element_with_namespace, strip_ns=True) - expected = {"request": ["https://zenodo.org/oai2d"]} - assert result == expected