Skip to content

Commit f3799fc

Browse files
authored
Reinstate image thumbnail column (#903)
1 parent 6f92f40 commit f3799fc

File tree

11 files changed: 44 additions and 10 deletions

DAGs.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ The following are DAGs grouped by their primary tag:
7979
| [`rawpixel_workflow`](#rawpixel_workflow) | `@monthly` | `False` | image |
8080
| [`science_museum_workflow`](#science_museum_workflow) | `@monthly` | `False` | image |
8181
| [`smithsonian_workflow`](#smithsonian_workflow) | `@weekly` | `False` | image |
82-
| `smk_workflow` | `@monthly` | `False` | image |
82+
| [`smk_workflow`](#smk_workflow) | `@monthly` | `False` | image |
8383
| [`stocksnap_workflow`](#stocksnap_workflow) | `@monthly` | `False` | image |
8484
| [`wikimedia_commons_workflow`](#wikimedia_commons_workflow) | `@daily` | `True` | image, audio |
8585
| [`wordpress_workflow`](#wordpress_workflow) | `@monthly` | `False` | image |
@@ -125,6 +125,7 @@ The following is documentation associated with each DAG (where available):
125125
1. [`report_pending_reported_media`](#report_pending_reported_media)
126126
1. [`science_museum_workflow`](#science_museum_workflow)
127127
1. [`smithsonian_workflow`](#smithsonian_workflow)
128+
1. [`smk_workflow`](#smk_workflow)
128129
1. [`stocksnap_workflow`](#stocksnap_workflow)
129130
1. [`wikimedia_commons_workflow`](#wikimedia_commons_workflow)
130131
1. [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow)
@@ -587,6 +588,18 @@ Output: TSV file containing the images and the respective meta-data.
587588
Notes: https://api.si.edu/openaccess/api/v1.0/search
588589

589590

591+
## `smk_workflow`
592+
593+
594+
Content Provider: Statens Museum for Kunst (National Gallery of Denmark)
595+
596+
ETL Process: Use the API to identify all openly licensed media.
597+
598+
Output: TSV file containing the media metadata.
599+
600+
Notes: https://www.smk.dk/en/article/smk-api/
601+
602+
590603
## `stocksnap_workflow`
591604

592605

openverse_catalog/dags/common/storage/image.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def add_item(
4444
foreign_landing_url: str,
4545
image_url: str,
4646
license_info: LicenseInfo,
47+
thumbnail_url: str | None = None,
4748
filesize: int | None = None,
4849
filetype: str | None = None,
4950
foreign_identifier: str | None = None,
@@ -122,7 +123,7 @@ def add_item(
122123
image_data = {
123124
"foreign_landing_url": foreign_landing_url,
124125
"image_url": image_url,
125-
"thumbnail_url": None,
126+
"thumbnail_url": thumbnail_url,
126127
"filesize": filesize,
127128
"filetype": filetype,
128129
"license_info": license_info,
@@ -149,10 +150,6 @@ def _get_image(self, **kwargs) -> Image | None:
149150
image_metadata = self.clean_media_metadata(**kwargs)
150151
if image_metadata is None:
151152
return None
152-
# Set the thumbnail to None to make sure no image provider scripts
153-
# write a value, and to make testing easier by not having to provide
154-
# the value.
155-
image_metadata["thumbnail_url"] = None
156153
# Convert the `image_url` key used in ImageStore, TSV and
157154
# provider API scripts into `url` key used in db
158155
image_metadata["url"] = image_metadata.pop("image_url")
@@ -176,6 +173,7 @@ class MockImageStore(ImageStore):
176173
"""
177174

178175
NULLABLE_FIELDS = [
176+
"thumbnail_url",
179177
"filesize",
180178
"filetype",
181179
"foreign_identifier",
@@ -206,7 +204,7 @@ def __init__(
206204
self.media_buffer = []
207205

208206
def add_item(self, **kwargs):
209-
image_data = kwargs | {"thumbnail_url": None}
207+
image_data = kwargs
210208
for field in MockImageStore.NULLABLE_FIELDS:
211209
if field not in image_data:
212210
image_data[field] = None

openverse_catalog/dags/common/tsv_cleaner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def _process_row(tsv_row):
4242
image_store.add_item(
4343
foreign_landing_url=row_image.foreign_landing_url,
4444
image_url=row_image.url,
45+
thumbnail_url=row_image.thumbnail_url,
4546
license_info=get_license_info(
4647
license_url=get_license_url(row_meta_data),
4748
license_=row_image.license_,

openverse_catalog/dags/providers/provider_api_scripts/smk.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
"""
2+
Content Provider: Statens Museum for Kunst (National Gallery of Denmark)
3+
4+
ETL Process: Use the API to identify all openly licensed media.
5+
6+
Output: TSV file containing the media metadata.
7+
8+
Notes: https://www.smk.dk/en/article/smk-api/
9+
"""
110
import logging
211

312
from common import constants
@@ -53,8 +62,6 @@ def _get_foreign_landing_url(item) -> str | None:
5362
def _get_image_url(image_iiif_id: str, image_size=2048):
5463
# For high quality IIIF-enabled images, restrict the image size to prevent
5564
# loading very large files.
56-
# TODO: consider just using the full "image_native" when adding the
57-
# "image_thumbnail".
5865
image_url = f"{image_iiif_id}/full/!{image_size},/0/default.jpg"
5966
return image_url
6067

@@ -91,13 +98,15 @@ def _get_images(item: dict) -> list:
9198
else:
9299
image_url = SmkDataIngester._get_image_url(iiif_id)
93100

101+
thumbnail_url = item.get("image_thumbnail")
94102
height = item.get("image_height")
95103
width = item.get("image_width")
96104
filesize = item.get("image_size") or item.get("size")
97105
images.append(
98106
{
99107
"id": image_id,
100108
"image_url": image_url,
109+
"thumbnail_url": thumbnail_url,
101110
"height": height,
102111
"width": width,
103112
"filesize": filesize,
@@ -114,6 +123,7 @@ def _get_images(item: dict) -> list:
114123
# 'id', so we must skip if `iiif_id` is not present.
115124
continue
116125
image_url = SmkDataIngester._get_image_url(iiif_id)
126+
thumbnail_url = alt_img.get("thumbnail")
117127
height = alt_img.get("height")
118128
width = alt_img.get("width")
119129
filesize = alt_img.get("image_size") or alt_img.get("size")
@@ -122,6 +132,7 @@ def _get_images(item: dict) -> list:
122132
{
123133
"id": iiif_id,
124134
"image_url": image_url,
135+
"thumbnail_url": thumbnail_url,
125136
"height": height,
126137
"width": width,
127138
"filesize": filesize,
@@ -157,6 +168,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None:
157168
"foreign_identifier": img.get("id"),
158169
"foreign_landing_url": self._get_foreign_landing_url(data),
159170
"image_url": img.get("image_url"),
171+
"thumbnail_url": img.get("thumbnail_url"),
160172
"license_info": license_info,
161173
"title": self._get_title(data),
162174
"creator": self._get_creator(data),

tests/dags/common/storage/test_image.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@ def mock_enrich_tags(tags):
110110
args_dict["license_"] = args_dict.get("license_info").license
111111
args_dict["license_version"] = args_dict.pop("license_info").version
112112
args_dict["url"] = args_dict.pop("image_url")
113-
args_dict["thumbnail_url"] = None
114113

115114
assert actual_image == image.Image(**args_dict)
116115

tests/dags/common/storage/test_media.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,7 @@ def test_MediaStore_get_image_gets_source(
293293
license_info=BY_LICENSE_INFO,
294294
foreign_landing_url=TEST_FOREIGN_LANDING_URL,
295295
image_url=TEST_IMAGE_URL,
296+
thumbnail_url=None,
296297
filetype=None,
297298
filesize=None,
298299
foreign_identifier=None,
@@ -350,6 +351,7 @@ def item_saver(arg):
350351
license_info=BY_LICENSE_INFO,
351352
foreign_landing_url="",
352353
image_url="",
354+
thumbnail_url=None,
353355
foreign_identifier=None,
354356
width=None,
355357
height=None,
@@ -385,6 +387,7 @@ def item_saver(arg):
385387
license_info=LicenseInfo("by", "4.0", valid_license_url, license_url),
386388
foreign_landing_url="",
387389
image_url="",
390+
thumbnail_url=None,
388391
foreign_identifier=None,
389392
width=None,
390393
height=None,

tests/dags/common/test_tsv_cleaner.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def test_clean_tsv_cleans_tsv_rows(tmpdir):
3030
call().add_item(
3131
foreign_landing_url="https://example.com/landing1",
3232
image_url="https://example.com/image1",
33+
thumbnail_url="https://example.com/thumbnail1",
3334
license_info=by_license,
3435
foreign_identifier="one",
3536
width="1000",
@@ -54,6 +55,7 @@ def test_clean_tsv_cleans_tsv_rows(tmpdir):
5455
call().add_item(
5556
foreign_landing_url="https://example.com/landing2",
5657
image_url="https://example.com/image2",
58+
thumbnail_url="https://example.com/thumbnail2",
5759
license_info=by_nc_license,
5860
foreign_identifier="two",
5961
width="1000",

tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_hq.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@
44
"height": 1059,
55
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2",
66
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/full/!2048,/0/default.jpg",
7+
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/2227ms627_KKSgb6458.tif.reconstructed.tif.jp2/full/!1024,/0/default.jpg",
78
"width": 3887
89
},
910
{
1011
"filesize": 19269857,
1112
"height": 1576,
1213
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
1314
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
15+
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg",
1416
"width": 4073
1517
}
1618
]

tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_legacy.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@
44
"height": 1059,
55
"id": "1170012466_object",
66
"image_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg",
7+
"thumbnail_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg",
78
"width": 3887
89
},
910
{
1011
"filesize": 19269857,
1112
"height": 1576,
1213
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
1314
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
15+
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg",
1416
"width": 4073
1517
}
1618
]

tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_partial.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"height": 1576,
55
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
66
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
7+
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg",
78
"width": 4073
89
}
910
]

tests/dags/providers/provider_api_scripts/resources/smk/image_data_hq.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,6 @@
1717
"image_iiif_id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2",
1818
"image_iiif_info": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/info.json",
1919
"image_size": 11784886,
20+
"image_thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/2227ms627_KKSgb6458.tif.reconstructed.tif.jp2/full/!1024,/0/default.jpg",
2021
"image_width": 3887
2122
}

0 commit comments

Comments
 (0)