Skip to content

Commit

Permalink
Update NYPL DAG to use API v2 (#3774)
Browse files Browse the repository at this point in the history
* Allow arguments to be handed into ipython recipe

* Update NYPL URL and test references

* Put tags in the appropriate database field

* Add unit test for tags extraction
  • Loading branch information
AetherUnbound authored Feb 16, 2024
1 parent 4632941 commit 78d799c
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 21 deletions.
34 changes: 18 additions & 16 deletions catalog/dags/providers/provider_api_scripts/nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def get_value_from_dict_or_list(

class NyplDataIngester(ProviderDataIngester):
providers = {"image": prov.NYPL_DEFAULT_PROVIDER}
endpoint_base = "http://api.repo.nypl.org/api/v1/items"
endpoint_base = "http://api.repo.nypl.org/api/v2/items"
endpoint = f"{endpoint_base}/search/"
metadata_endpoint = f"{endpoint_base}/item_details/"
batch_limit = 500
Expand Down Expand Up @@ -138,6 +138,7 @@ def get_record_data(self, data):
"filetype": filetype,
"category": category,
"meta_data": metadata,
"raw_tags": NyplDataIngester._get_tags(mods) or None,
}
images.append(image_data)
return images
Expand Down Expand Up @@ -221,6 +222,22 @@ def _get_creators(creatorinfo):
return info.get("namePart", {}).get("$")
return None

@staticmethod
def _get_tags(mods: dict) -> list[str]:
    """
    Collect the subject topic values of a MODS record as a list of tags.

    The ``subject`` entry may be a single dictionary or a list of them, and
    each subject's ``topic`` may likewise be a single dictionary or a list.
    The tag text is read from the ``"$"`` key of every topic dictionary.

    :param mods: the MODS dictionary for an item
    :return: the non-empty topic values in the order they were encountered;
        an empty list when there are no usable topics
    """
    # ``or []`` also covers a "subject" key that is present but null, which
    # ``mods.get("subject", [])`` would hand back as ``None``.
    subjects = mods.get("subject") or []
    if isinstance(subjects, dict):
        subjects = [subjects]

    tags = []
    for subject in subjects:
        if not isinstance(subject, dict) or "topic" not in subject:
            continue
        topic = subject["topic"]
        # Topic can be a dictionary or a list
        entries = topic if isinstance(topic, list) else [topic]
        # Skip anything that is not a dictionary (e.g. a null or bare string)
        # rather than failing the whole record on ``.get``.
        tags.extend(
            entry.get("$") for entry in entries if isinstance(entry, dict)
        )
    # Drop missing/empty values so callers only ever see real tag values.
    return [tag for tag in tags if tag]

@staticmethod
def _get_type_of_resource(mods: dict) -> str | None:
type_of_resource = mods.get("typeOfResource", {})
Expand Down Expand Up @@ -279,21 +296,6 @@ def _get_metadata(mods):
):
metadata["physical_description"] = physical_description

subject_list = mods.get("subject", [])
if isinstance(subject_list, dict):
subject_list = [subject_list]
# Topic can be a dictionary or a list
topics = [subject["topic"] for subject in subject_list if "topic" in subject]
if topics:
tags = []
for topic in topics:
if isinstance(topic, list):
tags.extend([t.get("$") for t in topic])
else:
tags.append(topic.get("$"))
if tags:
metadata["tags"] = ", ".join(tags)

return metadata


Expand Down
4 changes: 2 additions & 2 deletions catalog/justfile
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,12 @@ shell:
env DC_USER="airflow" just ../exec {{ SERVICE }} /bin/bash

# Launch an IPython shell in a new container under `SERVICE`
ipython: up-deps
ipython *args: up-deps
env DC_USER="airflow" just ../run \
--rm \
--workdir /opt/airflow/catalog/dags \
{{ SERVICE }} \
bash -c \'ipython\'
bash -c \'ipython {{ args }}\'

# Launch a `pgcli` shell in the PostgreSQL container
pgcli db_user_pass="deploy" db_name="openledger": up
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@
"genre": "Maps",
"physical_description": "4 polyester film encapsulations, some containing 2 sheets back-to-back. Accompanying text formatted as 1 large sheet (46 x 59 cm), in one of the encapsulations.",
"publisher": "New York Public Library, Local History and Genealogy Division",
"tags": "Census districts",
"type_of_resource": "cartographic"
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"numResults": "1275",
"result": [
{
"apiUri": "http://api.repo.nypl.org/api/v1/items/mods/0cabe3d0-3d50-0134-a8e0-00505686a51c",
"apiUri": "http://api.repo.nypl.org/api/v2/items/mods/0cabe3d0-3d50-0134-a8e0-00505686a51c",
"imageID": "56738462",
"itemLink": "http://digitalcollections.nypl.org/items/0cabe3d0-3d50-0134-a8e0-00505686a51c",
"rightsStatement": "To the extent that a jurisdiction grants The New York Public Library a copyright in this item, NYPL makes this item available under a Creative Commons CC0 1.0 Universal Public Domain Dedication. Though not required, if you want to credit us as the source, please use the following statement, \"From The New York Public Library,\" and provide a link back to the item on our Digital Collections site. Doing so helps us track how our collection is used and helps justify freely releasing even more content in the future.",
Expand Down
22 changes: 21 additions & 1 deletion catalog/tests/dags/providers/provider_api_scripts/test_nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,26 @@ def test_get_creators_failure():
assert actual_creator is None


@pytest.mark.parametrize("subject_container", [lambda item: [item], lambda item: item])
@pytest.mark.parametrize("topic_container", [lambda item: [item], lambda item: item])
@pytest.mark.parametrize(
    "topic, expected_tags",
    [
        # A topic with no keys produces no tags
        ({}, []),
        # A topic lacking the "$" key produces no tags
        ({"Unrelated": "Foo"}, []),
        # A topic carrying a "$" value produces that value as a tag
        ({"$": "value"}, ["value"]),
    ],
)
def test_get_tags(subject_container, topic_container, topic, expected_tags):
    """
    Check tag extraction for every combination of bare and list-wrapped
    subject and topic values.
    """
    mods = {"subject": subject_container({"topic": topic_container(topic)})}
    assert nypl._get_tags(mods) == expected_tags


def test_get_metadata():
item_response = _get_resource_json("response_itemdetails_success.json")
mods = item_response.get("nyplAPI").get("response").get("mods")
Expand Down Expand Up @@ -138,12 +158,12 @@ def test_get_record_data_success():
"date_issued": "1981",
"genre": "Maps",
"publisher": "New York Public Library, Local History and Genealogy Division",
"tags": "Census districts",
"type_of_resource": "cartographic",
"physical_description": "4 polyester film encapsulations, some containing 2 sheets back-to-back. "
"Accompanying text formatted as 1 large sheet (46 x 59 cm), in one of "
"the encapsulations.",
},
"raw_tags": ["Census districts"],
"title": "1900 census enumeration districts, Manhattan and Bronx",
"license_info": CC0,
}
Expand Down

0 comments on commit 78d799c

Please sign in to comment.