diff --git a/openverse_catalog/dags/providers/provider_api_scripts/nypl.py b/openverse_catalog/dags/providers/provider_api_scripts/nypl.py index cadf8413cc1..b226ab75758 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/nypl.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/nypl.py @@ -1,211 +1,290 @@ import logging +import re from urllib.parse import parse_qs, urlparse from airflow.models import Variable +from common import constants from common.licenses import get_license_info from common.loader import provider_details as prov -from common.requester import DelayedRequester -from common.storage.image import ImageStore -from requests.exceptions import JSONDecodeError +from common.loader.provider_details import ImageCategory +from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester -logging.basicConfig( - format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.INFO -) logger = logging.getLogger(__name__) -LIMIT = 500 -DELAY = 1.0 -RETRIES = 3 -PROVIDER = prov.NYPL_DEFAULT_PROVIDER -BASE_ENDPOINT = "http://api.repo.nypl.org/api/v1/items/search" -METADATA_ENDPOINT = "http://api.repo.nypl.org/api/v1/items/item_details/" -NYPL_API = Variable.get("API_KEY_NYPL", default_var=None) -TOKEN = f"Token token={NYPL_API}" -delay_request = DelayedRequester(delay=DELAY) -image_store = ImageStore(provider=PROVIDER) +def get_value_from_dict_or_list( + dict_or_list: dict | list, keys: list[str] +) -> dict | list | str | None: + """ + Returns the nested value. + If dict_or_list is a list, returns the value from the first + dictionary in the list. + If it is a dict, returns the value from the dict. 
+ """ + if not keys or not dict_or_list: + return dict_or_list + current_key, *updated_keys = keys + if isinstance(dict_or_list, list): + for item in dict_or_list: + if current_key in item: + val = item[current_key] + if isinstance(val, (list, dict)): + return get_value_from_dict_or_list(val, updated_keys) + elif isinstance(val, str) or val is None: + return val + return None + elif isinstance(dict_or_list, dict): + val = dict_or_list.get(current_key) + if isinstance(val, (list, dict)): + return get_value_from_dict_or_list(val, updated_keys) + return val + + +class NyplDataIngester(ProviderDataIngester): + providers = {"image": prov.NYPL_DEFAULT_PROVIDER} + endpoint_base = "http://api.repo.nypl.org/api/v1/items" + endpoint = f"{endpoint_base}/search/" + metadata_endpoint = f"{endpoint_base}/item_details/" + batch_limit = 500 + delay = 5 + # NYPL returns a list of image objects, with the dimension encoded + # in the URL's query parameter. + # This list is in order from the largest image to the smallest one. 
class NyplDataIngester(ProviderDataIngester):
    """
    Ingest CC0-licensed images from the New York Public Library
    Digital Collections API.

    Each search result requires a second `item_details` request to
    obtain the captures (individual images) belonging to the item.
    """

    providers = {"image": prov.NYPL_DEFAULT_PROVIDER}
    endpoint_base = "http://api.repo.nypl.org/api/v1/items"
    endpoint = f"{endpoint_base}/search/"
    metadata_endpoint = f"{endpoint_base}/item_details/"
    batch_limit = 500
    delay = 5
    # NYPL returns a list of image objects, with the dimension encoded
    # in the URL's query parameter.
    # This list is in order from the largest image to the smallest one.
    image_url_dimensions = ["g", "v", "q", "w", "r"]

    def __init__(self, *args, **kwargs):
        nypl_api_key = Variable.get("API_KEY_NYPL")
        # Set on the instance rather than the class so an instance-specific
        # API key is never shared across separately constructed ingesters.
        self.headers = {"Authorization": f"Token token={nypl_api_key}"}
        super().__init__(*args, **kwargs)

    def get_next_query_params(self, prev_query_params, **kwargs):
        """
        Return the query params for the next search request.

        The first call seeds the CC0 search; every later call advances the
        1-based `page` parameter by one, leaving all other params intact.
        """
        if not prev_query_params:
            return {
                "q": "CC_0",
                "field": "use_rtxt_s",
                "page": 1,
                "per_page": self.batch_limit,
            }
        # Advance to the next page of results.
        return {
            **prev_query_params,
            "page": prev_query_params["page"] + 1,
        }

    def get_media_type(self, record):
        # This provider only supports Images.
        return constants.IMAGE

    def get_batch_data(self, response_json):
        """Extract the list of search results from an API response, or None."""
        if response_json:
            return response_json.get("nyplAPI", {}).get("response", {}).get("result")
        return None

    def get_record_data(self, data):
        """
        Build the list of image records for one search result.

        Fetches the item details for the result's UUID, then creates one
        record dict per usable capture. Returns None when the details
        request fails or the item has no captures.
        """
        uuid = data.get("uuid")

        item_json = (
            self.get_response_json({}, endpoint=self.metadata_endpoint + uuid) or {}
        )
        item_details = item_json.get("nyplAPI", {}).get("response")
        if not item_details:
            return None
        mods = item_details.get("mods")

        title_info = mods.get("titleInfo")
        # titleInfo may be a single dict or a list of dicts; use the first.
        if isinstance(title_info, list) and title_info:
            title_info = title_info[0]
        title = "" if title_info is None else title_info.get("title", {}).get("$")

        name_properties = mods.get("name")
        creator = self._get_creators(name_properties) if name_properties else None

        metadata = self._get_metadata(mods)
        category = (
            ImageCategory.PHOTOGRAPH.value
            if metadata.get("genre") == "Photographs"
            else None
        )

        # An item may have one capture (dict) or several (list).
        captures = item_details.get("sibling_captures", {}).get("capture")
        if not captures:
            return None
        if not isinstance(captures, list):
            captures = [captures]

        images = []
        for capture in captures:
            image_id = capture.get("imageID", {}).get("$")
            if image_id is None:
                continue

            image_link = capture.get("imageLinks", {}).get("imageLink", [])
            image_url, filetype = self._get_image_data(image_link)
            if not image_url:
                continue

            foreign_landing_url = capture.get("itemLink", {}).get("$")
            license_url = capture.get("rightsStatementURI", {}).get("$")
            if not foreign_landing_url or license_url is None:
                continue

            images.append(
                {
                    "foreign_identifier": image_id,
                    "foreign_landing_url": foreign_landing_url,
                    "image_url": image_url,
                    "license_info": get_license_info(license_url=license_url),
                    "title": title,
                    "creator": creator,
                    "filetype": filetype,
                    "category": category,
                    "meta_data": metadata,
                }
            )
        return images

    @staticmethod
    def _get_filetype(description: str) -> str | None:
        """
        Extracts the filetype from a description string like:
        "Cropped .jpeg (1600 pixels on the long side)"
        This is required because the filetype is not present/extractable from the
        url via the MediaStore class.
        :param description: the description string
        :return: "jpeg" or "gif", or None when neither is found
        """
        # The dot is escaped so only a literal " .jpeg "/" .gif " matches;
        # an unescaped `.` would also match strings like " xjpeg ".
        if match := re.search(r" \.(jpeg|gif) ", description):
            return match.group(1)
        return None

    @staticmethod
    def _get_image_data(images) -> tuple[None, None] | tuple[str, str | None]:
        """
        Receives a list of dictionaries of the following shape:
        {
            "$": "http://images.nypl.org/index.php?id=56738467&t=q&download=1
            &suffix=29eed1f0-3d50-0134-c4c7-00505686a51c.001",
            "description": "Cropped .jpeg (1600 pixels on the long side)"
        }
        Selects the largest image based on the image URL's `t` query parameter
        and image_url_dimensions, returning its URL and filetype.
        """
        # Map each image's `t` dimension code to its index in `images`.
        image_types = {
            parse_qs(urlparse(img["$"]).query)["t"][0]: i
            for i, img in enumerate(images)
        }
        if not image_types:
            return None, None

        # image_url_dimensions is ordered largest-to-smallest, so the first
        # dimension present identifies the largest available image.
        for dimension in NyplDataIngester.image_url_dimensions:
            preferred_image_index = image_types.get(dimension)
            if preferred_image_index is not None:
                preferred_image = images[preferred_image_index]

                # Removes the `download` query to get the viewable image URL
                image_url = preferred_image["$"].replace("&download=1", "")
                filetype = NyplDataIngester._get_filetype(
                    preferred_image["description"]
                )
                return image_url, filetype

        return None, None

    @staticmethod
    def _get_creators(creatorinfo):
        """Return the namePart of the first creator marked `usage == "primary"`."""
        if not isinstance(creatorinfo, list):
            creatorinfo = [creatorinfo]
        for info in creatorinfo:
            if info.get("usage") == "primary":
                return info.get("namePart", {}).get("$")
        return None

    @staticmethod
    def _get_type_of_resource(mods: dict) -> str | None:
        """Return the primary typeOfResource value, if any."""
        type_of_resource = mods.get("typeOfResource", {})
        if isinstance(type_of_resource, list):
            for type_ in type_of_resource:
                if type_.get("usage") == "primary":
                    return type_.get("$")
            return None
        else:
            return type_of_resource.get("$")

    @staticmethod
    def _get_metadata(mods):
        """
        Collect the optional metadata fields (type of resource, genre, dates,
        publisher, physical description, tags) from the item's MODS record.
        Absent fields are simply omitted from the returned dict.
        """
        metadata = {}

        if type_of_resource := NyplDataIngester._get_type_of_resource(mods):
            metadata["type_of_resource"] = type_of_resource

        if genre := get_value_from_dict_or_list(mods, ["genre", "$"]):
            metadata["genre"] = genre

        origin_info = mods.get("originInfo", {})

        if date_issued := get_value_from_dict_or_list(
            mods, ["originInfo", "dateIssued", "$"]
        ):
            metadata["date_issued"] = date_issued

        if date_created_object := get_value_from_dict_or_list(
            origin_info, ["dateCreated"]
        ):
            if isinstance(date_created_object, dict):
                if date_created := date_created_object.get("$"):
                    metadata["date_created"] = date_created
            elif isinstance(date_created_object, list):
                # Approximate dates have a start and an end
                # [{'encoding': 'w3cdtf', 'keyDate': 'yes',
                # 'point': 'start', 'qualifier': 'approximate', '$': '1990'},
                # {'encoding': 'w3cdtf', 'point': 'end',
                # 'qualifier': 'approximate', '$': '1999'}]
                start, end = None, None
                for item in date_created_object:
                    point = item.get("point")
                    if point == "start":
                        start = item.get("$")
                    elif point == "end":
                        end = item.get("$")
                if start:
                    metadata["date_created"] = f"{start}{f'-{end}' if end else ''}"

        if publisher := get_value_from_dict_or_list(origin_info, ["publisher", "$"]):
            metadata["publisher"] = publisher

        if physical_description := get_value_from_dict_or_list(
            mods, ["physicalDescription", "note", "$"]
        ):
            metadata["physical_description"] = physical_description

        subject_list = mods.get("subject", [])
        if isinstance(subject_list, dict):
            subject_list = [subject_list]
        # Topic can be a dictionary or a list
        topics = [subject["topic"] for subject in subject_list if "topic" in subject]
        if topics:
            tags = []
            for topic in topics:
                # Skip topics without a "$" value so `join` never sees None.
                if isinstance(topic, list):
                    tags.extend(t.get("$") for t in topic if t.get("$"))
                elif topic.get("$"):
                    tags.append(topic.get("$"))
            if tags:
                metadata["tags"] = ", ".join(tags)

        return metadata


def main():
    """Entry point: run a full NYPL ingestion."""
    logger.info("Begin: NYPL data ingestion")
    ingester = NyplDataIngester()
    ingester.ingest_records()
a/openverse_catalog/dags/providers/provider_workflows.py +++ b/openverse_catalog/dags/providers/provider_workflows.py @@ -11,6 +11,7 @@ from providers.provider_api_scripts.jamendo import JamendoDataIngester from providers.provider_api_scripts.metropolitan_museum import MetMuseumDataIngester from providers.provider_api_scripts.museum_victoria import VictoriaDataIngester +from providers.provider_api_scripts.nypl import NyplDataIngester from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester from providers.provider_api_scripts.science_museum import ScienceMuseumDataIngester from providers.provider_api_scripts.smk import SmkDataIngester @@ -174,6 +175,7 @@ def __post_init__(self): ), ProviderWorkflow( provider_script="nypl", + ingestion_callable=NyplDataIngester, start_date=datetime(2020, 1, 1), ), ProviderWorkflow( diff --git a/tests/dags/providers/provider_api_scripts/resources/nypl/metadata.json b/tests/dags/providers/provider_api_scripts/resources/nypl/metadata.json index 167d1d0880f..a99515fec99 100644 --- a/tests/dags/providers/provider_api_scripts/resources/nypl/metadata.json +++ b/tests/dags/providers/provider_api_scripts/resources/nypl/metadata.json @@ -1,7 +1,8 @@ { "date_issued": "1981", - "description": "4 polyester film encapsulations, some containing 2 sheets back-to-back. Accompanying text formatted as 1 large sheet (46 x 59 cm), in one of the encapsulations.", "genre": "Maps", + "physical_description": "4 polyester film encapsulations, some containing 2 sheets back-to-back. 
Accompanying text formatted as 1 large sheet (46 x 59 cm), in one of the encapsulations.", "publisher": "New York Public Library, Local History and Genealogy Division", + "tags": "Census districts", "type_of_resource": "cartographic" } diff --git a/tests/dags/providers/provider_api_scripts/test_nypl.py b/tests/dags/providers/provider_api_scripts/test_nypl.py index 52dc8939f72..8208f534207 100644 --- a/tests/dags/providers/provider_api_scripts/test_nypl.py +++ b/tests/dags/providers/provider_api_scripts/test_nypl.py @@ -1,120 +1,77 @@ import json -import logging -import os -from unittest.mock import MagicMock, patch +from pathlib import Path +from unittest.mock import patch + +import pytest +from common.licenses import LicenseInfo +from providers.provider_api_scripts.nypl import ( + NyplDataIngester, + get_value_from_dict_or_list, +) -import requests -from providers.provider_api_scripts import nypl as np +RESOURCES = Path(__file__).parent / "resources/nypl" +CC0 = LicenseInfo( + license="cc0", + version="1.0", + url="https://creativecommons.org/publicdomain/zero/1.0/", + raw_url="https://creativecommons.org/publicdomain/zero/1.0/", +) -RESOURCES = os.path.join(os.path.abspath(os.path.dirname(__file__)), "resources/nypl") -logging.basicConfig( - format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.DEBUG -) +@pytest.fixture(autouse=True) +def validate_url_string(): + with patch("common.urls.rewrite_redirected_url") as mock_validate_url_string: + mock_validate_url_string.side_effect = lambda x: x + yield + + +nypl = NyplDataIngester() +image_store = nypl.media_stores["image"] def _get_resource_json(json_name): - with open(os.path.join(RESOURCES, json_name)) as f: + with open(RESOURCES / json_name) as f: resource_json = json.load(f) - return resource_json + return resource_json -def test_get_query_param_default(): - actual_param = np._get_query_param() +def test_get_next_query_params_default(): + actual_param = nypl.get_next_query_params({}) 
expected_param = {"q": "CC_0", "field": "use_rtxt_s", "page": 1, "per_page": 500} - assert actual_param == expected_param -def test_get_query_param_offset(): - actual_param = np._get_query_param(page=10) - expected_param = {"q": "CC_0", "field": "use_rtxt_s", "page": 10, "per_page": 500} +def test_get_next_query_params_increments_offset(): + previous_query_params = { + "q": "CC_0", + "field": "use_rtxt_s", + "page": 10, + "per_page": 500, + } + actual_param = nypl.get_next_query_params(previous_query_params) + expected_param = {"q": "CC_0", "field": "use_rtxt_s", "page": 11, "per_page": 500} assert actual_param == expected_param -def test_request_handler_search_success(): - query_param = {"q": "CC_0", "field": "use_rtxt_s", "page": 12, "per_page": 1} - +def test_get_batch_data_success(): response_search_success = _get_resource_json("response_search_success.json") - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value=response_search_success) - with patch.object(np.delay_request, "get", return_value=r) as mock_call: - actual_response = np._request_handler(params=query_param) - - expected_response = response_search_success.get("nyplAPI", {}).get("response") - - assert actual_response == expected_response - assert mock_call.call_count == 1 + actual_response = nypl.get_batch_data(response_search_success) + assert len(actual_response) == 1 -def test_request_handler_itemdetail_success(): - response_itemdetails_success = _get_resource_json( - "response_itemdetails_success.json" - ) - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value=response_itemdetails_success) - with patch.object(np.delay_request, "get", return_value=r) as mock_call: - actual_response = np._request_handler( - endpoint=np.METADATA_ENDPOINT + ("0cabe3d0-3d50-0134-a8e0-00505686a51c"), - ) - expected_response = response_itemdetails_success.get("nyplAPI", {}).get("response") +def test_get_batch_data_failure(): + response_search_failure = {} + 
actual_response = nypl.get_batch_data(response_search_failure) - assert actual_response == expected_response - assert mock_call.call_count == 1 - - -def test_request_handler_failure(): - query_param = {"q": "CC_0", "field": "use_rtxt_s", "page": 12, "per_page": 1} - - r = requests.Response() - r.status_code = 400 - r.json = MagicMock(return_value={}) - with patch.object(np.delay_request, "get", return_value=r) as mock_call: - actual_response = np._request_handler(params=query_param) - assert mock_call.call_count == 3 assert actual_response is None -def test_get_images_success(): - images = _get_resource_json("images.json") - actual_image_url = np._get_image_url(images) - - assert actual_image_url == ( - "http://images.nypl.org/index.php?id=56738462&t=g&suffix=0cabe3d0-" - "3d50-0134-a8e0-00505686a51c.001" - ) - - -def test_get_image_failure(): - images = [] - actual_image_url = np._get_image_url(images) - - assert actual_image_url is None - - -def test_get_title_success(): - titleinfo = _get_resource_json("title_info_success.json") - actual_title = np._get_title(titleinfo) - expected_title = "1900 census enumeration districts, Manhattan and Bronx" - - assert actual_title == expected_title - - -def test_get_title_failure(): - titleinfo = [] - actual_title = np._get_title(titleinfo) - - assert actual_title is None - - def test_get_creators_success(): creatorinfo = _get_resource_json("creator_info_success.json") - actual_creator = np._get_creators(creatorinfo) + actual_creator = nypl._get_creators(creatorinfo) expected_creator = "Hillman, Barbara" assert actual_creator == expected_creator @@ -122,7 +79,7 @@ def test_get_creators_success(): def test_get_creators_failure(): creatorinfo = [] - actual_creator = np._get_creators(creatorinfo) + actual_creator = nypl._get_creators(creatorinfo) assert actual_creator is None @@ -130,7 +87,7 @@ def test_get_creators_failure(): def test_get_metadata(): item_response = _get_resource_json("response_itemdetails_success.json") mods 
= item_response.get("nyplAPI").get("response").get("mods") - actual_metadata = np._get_metadata(mods) + actual_metadata = nypl._get_metadata(mods) expected_metadata = _get_resource_json("metadata.json") assert actual_metadata == expected_metadata @@ -145,54 +102,97 @@ def test_get_metadata_missing_attrs(): mods["physicalDescription"].pop("note") # Remove data from expected values too expected_metadata = _get_resource_json("metadata.json") - for attr in ["date_issued", "publisher", "description"]: + for attr in ["date_issued", "publisher", "physical_description"]: expected_metadata.pop(attr) - actual_metadata = np._get_metadata(mods) + actual_metadata = nypl._get_metadata(mods) assert actual_metadata == expected_metadata -def test_handle_results_success(): +def test_get_record_data_success(): search_response = _get_resource_json("response_search_success.json") - result = search_response.get("nyplAPI").get("response").get("result") - + result = search_response["nyplAPI"]["response"]["result"][0] item_response = _get_resource_json("response_itemdetails_success.json") - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value=item_response) - with patch.object(np.delay_request, "get", return_value=r) as mock_request: - with patch.object(np.image_store, "add_item") as mock_item: - np._handle_results(result) - - assert mock_item.call_count == 7 - assert mock_request.call_count == 1 - -def test_handle_results_failure(): + with patch.object(nypl, "get_response_json", return_value=item_response): + images = nypl.get_record_data(result) + assert len(images) == 7 + expected_image = { + "category": None, + "creator": "Hillman, Barbara", + "filetype": "jpeg", + "foreign_identifier": "56738462", + "foreign_landing_url": "http://digitalcollections.nypl.org/items/0cabe3d0-3d50-0134-a8e0-00505686a51c", + "image_url": "http://images.nypl.org/index.php?id=56738462&t=g&suffix=0cabe3d0-3d50-0134-a8e0-00505686a51c.001", + "meta_data": { + "date_issued": "1981", + 
"genre": "Maps", + "publisher": "New York Public Library, Local History and Genealogy Division", + "tags": "Census districts", + "type_of_resource": "cartographic", + "physical_description": "4 polyester film encapsulations, some containing 2 sheets back-to-back. " + "Accompanying text formatted as 1 large sheet (46 x 59 cm), in one of " + "the encapsulations.", + }, + "title": "1900 census enumeration districts, Manhattan and Bronx", + "license_info": CC0, + } + assert images[0] == expected_image + + +def test_get_record_data_failure(): search_response = _get_resource_json("response_search_success.json") - result = search_response.get("nyplAPI").get("response").get("result") + result = search_response["nyplAPI"]["response"]["result"][0] item_response = None - with patch.object(np, "_request_handler", return_value=item_response): - with patch.object(np.image_store, "add_item") as mock_item: - np._handle_results(result) - - assert mock_item.call_count == 0 - - -def test_get_capture_detail_success(): - captures = _get_resource_json("capture_details.json") - - with patch.object(np.image_store, "add_item") as mock_item: - np._get_capture_details(captures=captures) - assert mock_item.call_count == 7 - - -def test_get_capture_detail_failure(): - captures = [] - - with patch.object(np.image_store, "add_item") as mock_item: - np._get_capture_details(captures=captures) - - assert mock_item.call_count == 0 + with patch.object(nypl, "get_response_json", return_value=item_response): + images = nypl.get_record_data(result) + assert images is None + + +@pytest.mark.parametrize( + "dict_or_list, keys, expected", + [ + ({"genre": None}, [], {"genre": None}), + ({"genre": None}, ["$"], None), + ([{"genre": None}], ["$"], None), + ( + { + "genre": { + "$": "Maps", + "authority": "lctgm", + "valueURI": "http://id.loc.gov/vocabulary/graphicMaterials/tgm006261", + } + }, + ["genre"], + { + "$": "Maps", + "authority": "lctgm", + "valueURI": 
"http://id.loc.gov/vocabulary/graphicMaterials/tgm006261", + }, + ), + ( + { + "genre": { + "$": "Maps", + "authority": "lctgm", + "valueURI": "http://id.loc.gov/vocabulary/graphicMaterials/tgm006261", + } + }, + ["genre", "$"], + "Maps", + ), + ({"a": [{"b": "b_value"}, {"c": "c_value"}]}, ["a", "c"], "c_value"), + ], + ids=[ + "empty list of keys", + "key not present in a dict", + "key not present in a list", + "return a dict value with one key", + "return a string value with a list of keys", + "return a string value with a list of keys, from a list", + ], +) +def test_get_value_from_dict_or_list(keys, dict_or_list, expected): + assert get_value_from_dict_or_list(dict_or_list, keys) == expected