Update NYPL DAG to use API v2 (#3774)

* Allow arguments to be handed into ipython recipe
* Update NYPL URL and test references
* Put tags in the appropriate database field
* Add unit test for tags extraction
WordPress · Feb 16, 2024 · 78d799c · 78d799c
1 parent 4632941
commit 78d799c
Show file tree

Hide file tree

Showing 5 changed files with 42 additions and 21 deletions.
diff --git a/catalog/dags/providers/provider_api_scripts/nypl.py b/catalog/dags/providers/provider_api_scripts/nypl.py
@@ -44,7 +44,7 @@ def get_value_from_dict_or_list(
 
 class NyplDataIngester(ProviderDataIngester):
     providers = {"image": prov.NYPL_DEFAULT_PROVIDER}
-    endpoint_base = "http://api.repo.nypl.org/api/v1/items"
+    endpoint_base = "http://api.repo.nypl.org/api/v2/items"
     endpoint = f"{endpoint_base}/search/"
     metadata_endpoint = f"{endpoint_base}/item_details/"
     batch_limit = 500
@@ -138,6 +138,7 @@ def get_record_data(self, data):
                 "filetype": filetype,
                 "category": category,
                 "meta_data": metadata,
+                "raw_tags": NyplDataIngester._get_tags(mods) or None,
             }
             images.append(image_data)
         return images
@@ -221,6 +222,22 @@ def _get_creators(creatorinfo):
                 return info.get("namePart", {}).get("$")
         return None
 
+    @staticmethod
+    def _get_tags(mods: dict) -> list[str]:
+        subject_list = mods.get("subject", [])
+        if isinstance(subject_list, dict):
+            subject_list = [subject_list]
+        # Topic can be a dictionary or a list
+        topics = [subject["topic"] for subject in subject_list if "topic" in subject]
+        tags = []
+        if topics:
+            for topic in topics:
+                if isinstance(topic, list):
+                    tags.extend([t.get("$") for t in topic])
+                else:
+                    tags.append(topic.get("$"))
+        return [tag for tag in tags if tag]
+
     @staticmethod
     def _get_type_of_resource(mods: dict) -> str | None:
         type_of_resource = mods.get("typeOfResource", {})
@@ -279,21 +296,6 @@ def _get_metadata(mods):
         ):
             metadata["physical_description"] = physical_description
 
-        subject_list = mods.get("subject", [])
-        if isinstance(subject_list, dict):
-            subject_list = [subject_list]
-        # Topic can be a dictionary or a list
-        topics = [subject["topic"] for subject in subject_list if "topic" in subject]
-        if topics:
-            tags = []
-            for topic in topics:
-                if isinstance(topic, list):
-                    tags.extend([t.get("$") for t in topic])
-                else:
-                    tags.append(topic.get("$"))
-            if tags:
-                metadata["tags"] = ", ".join(tags)
-
         return metadata
 
 

diff --git a/catalog/justfile b/catalog/justfile
@@ -82,12 +82,12 @@ shell:
     env DC_USER="airflow" just ../exec {{ SERVICE }} /bin/bash
 
 # Launch an IPython shell in a new container under `SERVICE`
-ipython: up-deps
+ipython *args: up-deps
     env DC_USER="airflow" just ../run \
         --rm \
         --workdir /opt/airflow/catalog/dags \
         {{ SERVICE }} \
-        bash -c \'ipython\'
+        bash -c \'ipython {{ args }}\'
 
 # Launch a `pgcli` shell in the PostgreSQL container
 pgcli db_user_pass="deploy" db_name="openledger": up

diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/metadata.json b/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/metadata.json
@@ -3,6 +3,5 @@
   "genre": "Maps",
   "physical_description": "4 polyester film encapsulations, some containing 2 sheets back-to-back. Accompanying text formatted as 1 large sheet (46 x 59 cm), in one of the encapsulations.",
   "publisher": "New York Public Library, Local History and Genealogy Division",
-  "tags": "Census districts",
   "type_of_resource": "cartographic"
 }
diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/response_search_success.json b/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/response_search_success.json
@@ -17,7 +17,7 @@
       "numResults": "1275",
       "result": [
         {
-          "apiUri": "http://api.repo.nypl.org/api/v1/items/mods/0cabe3d0-3d50-0134-a8e0-00505686a51c",
+          "apiUri": "http://api.repo.nypl.org/api/v2/items/mods/0cabe3d0-3d50-0134-a8e0-00505686a51c",
           "imageID": "56738462",
           "itemLink": "http://digitalcollections.nypl.org/items/0cabe3d0-3d50-0134-a8e0-00505686a51c",
           "rightsStatement": "To the extent that a jurisdiction grants The New York Public Library a copyright in this item, NYPL makes this item available under a Creative Commons CC0 1.0 Universal Public Domain Dedication. Though not required, if you want to credit us as the source, please use the following statement, \"From The New York Public Library,\" and provide a link back to the item on our Digital Collections site. Doing so helps us track how our collection is used and helps justify freely releasing even more content in the future.",

diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py b/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py
@@ -80,6 +80,26 @@ def test_get_creators_failure():
     assert actual_creator is None
 
 
+@pytest.mark.parametrize("subject_container", [lambda x: [x], lambda x: x])
+@pytest.mark.parametrize("topic_container", [lambda x: [x], lambda x: x])
+@pytest.mark.parametrize(
+    "topic, expected_tags",
+    [
+        # No topics
+        [{}, []],
+        # Unrelated topics
+        [{"Unrelated": "Foo"}, []],
+        # Relevant topics
+        [{"$": "value"}, ["value"]],
+    ],
+)
+def test_get_tags(subject_container, topic_container, topic, expected_tags):
+    topics = topic_container(topic)
+    subject = subject_container({"topic": topics})
+    actual_tags = nypl._get_tags({"subject": subject})
+    assert actual_tags == expected_tags
+
+
 def test_get_metadata():
     item_response = _get_resource_json("response_itemdetails_success.json")
     mods = item_response.get("nyplAPI").get("response").get("mods")
@@ -138,12 +158,12 @@ def test_get_record_data_success():
             "date_issued": "1981",
             "genre": "Maps",
             "publisher": "New York Public Library, Local History and Genealogy Division",
-            "tags": "Census districts",
             "type_of_resource": "cartographic",
             "physical_description": "4 polyester film encapsulations, some containing 2 sheets back-to-back. "
             "Accompanying text formatted as 1 large sheet (46 x 59 cm), in one of "
             "the encapsulations.",
         },
+        "raw_tags": ["Census districts"],
         "title": "1900 census enumeration districts, Manhattan and Bronx",
         "license_info": CC0,
     }