Skip to content

Commit 6636dcf

Browse files
authored
Make User-Agent a default header in the Catalog (#3828)
* Add User-Agent as default header to DelayedRequester * Remove User-Agent header from provider DAGs * Test User-Agent in ProviderDataIngester class * Set User-Agent header in ProviderDataIngester * Update User-Agent string with CANONICAL_ORIGIN * Add CANONICAL_DOMAIN variable and derive CANONICAL_ORIGIN from it
1 parent b873150 commit 6636dcf

File tree

10 files changed

+39
-14
lines changed

10 files changed

+39
-14
lines changed

catalog/dags/common/loader/provider_details.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,13 @@
124124

125125
# User-Agent header for APIs that require it
126126
CONTACT_EMAIL = os.getenv("CONTACT_EMAIL")
127-
UA_STRING = f"Openverse/0.1 (https://wordpress.org/openverse; {CONTACT_EMAIL})"
127+
128+
CANONICAL_DOMAIN: str = os.getenv("CANONICAL_DOMAIN", "openverse.org")
129+
130+
_proto = "http" if "localhost" in CANONICAL_DOMAIN else "https"
131+
CANONICAL_ORIGIN: str = f"{_proto}://{CANONICAL_DOMAIN}"
132+
133+
UA_STRING = f"Openverse/0.1 ({CANONICAL_ORIGIN}; {CONTACT_EMAIL})"
128134

129135

130136
# Available Image Categories for API

catalog/dags/common/requester.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from requests.exceptions import JSONDecodeError
88

99
import oauth2
10+
from common.loader import provider_details as prov
1011

1112

1213
# pytest_socket will not be available in production, so we must create a shim for
@@ -42,12 +43,13 @@ class DelayedRequester:
4243
delay: an integer giving the minimum number of seconds to wait
4344
between consecutive requests via the `get` method.
4445
headers: a dict that will be passed in all requests, unless overridden
45-
by kwargs in specific calls to the get method
46+
by kwargs in specific calls to the `get` method
4647
"""
4748

48-
def __init__(self, delay=0, headers=None):
49+
def __init__(self, delay: int = 0, headers: dict | None = None):
50+
headers = {} if headers is None else headers
4951
self._DELAY = delay
50-
self.headers = headers or {}
52+
self.headers = {"User-Agent": prov.UA_STRING} | headers
5153
self._last_request = 0
5254
self.session = requests.Session()
5355

catalog/dags/providers/provider_api_scripts/museum_victoria.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class ImageDetails(TypedDict, total=False):
2525
class VictoriaDataIngester(ProviderDataIngester):
2626
providers = {"image": prov.VICTORIA_DEFAULT_PROVIDER}
2727
endpoint = "https://collections.museumsvictoria.com.au/api/search"
28-
headers = {"User-Agent": prov.UA_STRING, "Accept": "application/json"}
28+
headers = {"Accept": "application/json"}
2929
batch_limit = 100
3030
delay = 5
3131
LANDING_PAGE = "https://collections.museumsvictoria.com.au/"

catalog/dags/providers/provider_api_scripts/nappy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
class NappyDataIngester(ProviderDataIngester):
2525
providers = {constants.IMAGE: prov.NAPPY_DEFAULT_PROVIDER}
2626
endpoint = "https://api.nappy.co/v1/openverse/images"
27-
headers = {"User-Agent": prov.UA_STRING, "Accept": "application/json"}
27+
headers = {"Accept": "application/json"}
2828

2929
# Hardcoded to CC0, the only license Nappy.co uses
3030
license_info = get_license_info(

catalog/dags/providers/provider_api_scripts/provider_data_ingester.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from airflow.exceptions import AirflowException
99
from airflow.models import Variable
1010

11+
from common.loader import provider_details as prov
1112
from common.requester import DelayedRequester
1213
from common.storage.media import MediaStore
1314
from common.storage.util import get_media_store_class
@@ -145,6 +146,9 @@ def __init__(
145146
# Keep track of number of records ingested
146147
self.record_count = 0
147148

149+
# Set default headers
150+
self.headers = {"User-Agent": prov.UA_STRING} | self.headers
151+
148152
# Initialize the DelayedRequester and all necessary Media Stores.
149153
self.delayed_requester = DelayedRequester(
150154
delay=self.delay, headers=self.headers

catalog/dags/providers/provider_api_scripts/rawpixel.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ class RawpixelDataIngester(ProviderDataIngester):
8989
def __init__(self, *args, **kwargs):
9090
super().__init__(*args, **kwargs)
9191
self.api_key: str = Variable.get("API_KEY_RAWPIXEL")
92-
self.headers = {"User-Agent": prov.UA_STRING}
9392

9493
def get_media_type(self, record: dict) -> str:
9594
return constants.IMAGE

catalog/dags/providers/provider_api_scripts/stocksnap.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,7 @@ class StockSnapDataIngester(ProviderDataIngester):
3131
providers = {"image": prov.STOCKSNAP_DEFAULT_PROVIDER}
3232
batch_limit = 1000
3333
delay = 1 # in seconds
34-
headers = {
35-
"Accept": "application/json",
36-
"User-Agent": prov.UA_STRING,
37-
}
34+
headers = {"Accept": "application/json"}
3835
license_url = "https://creativecommons.org/publicdomain/zero/1.0/"
3936
license_info = get_license_info(license_url=license_url)
4037

catalog/dags/providers/provider_api_scripts/wikimedia_commons.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ class WikimediaCommonsDataIngester(ProviderDataIngester):
130130
}
131131
host = "commons.wikimedia.org"
132132
endpoint = f"https://{host}/w/api.php"
133-
headers = {"User-Agent": prov.UA_STRING}
134133

135134
# The 10000 is a bit arbitrary, but needs to be larger than the mean
136135
# number of uses per file (globally) in the response_json, or we will

catalog/tests/dags/common/test_requester.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88

99
from catalog.tests.dags.conftest import FAKE_OAUTH_PROVIDER_NAME
1010
from common import requester
11+
from common.loader import provider_details as prov
12+
13+
14+
USER_AGENT = {"User-Agent": prov.UA_STRING}
1115

1216

1317
@patch("common.requester.time")
@@ -155,8 +159,12 @@ def test_oauth_requester_initializes_correctly(oauth_provider_var_mock):
155159
@pytest.mark.parametrize(
156160
"init_headers, request_kwargs, expected_request_kwargs",
157161
[
158-
(None, None, {"headers": {}}),
159-
({"init_header": "test"}, None, {"headers": {"init_header": "test"}}),
162+
(None, None, {"headers": USER_AGENT}),
163+
(
164+
{"init_header": "test"},
165+
None,
166+
{"headers": {"init_header": "test"} | USER_AGENT},
167+
),
160168
(
161169
None,
162170
{"headers": {"h1": "test1"}, "other_kwarg": "test"},

catalog/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
MockImageOnlyProviderDataIngester,
1818
MockProviderDataIngester,
1919
)
20+
from common.loader import provider_details as prov
2021
from common.requester import RetriesExceeded
2122
from common.storage.audio import AudioStore, MockAudioStore
2223
from common.storage.image import ImageStore, MockImageStore
@@ -92,6 +93,15 @@ def test_get_response_json(endpoint, expected):
9293
assert actual_endpoint == expected
9394

9495

96+
def test_passes_user_agent_header():
97+
ingester = MockProviderDataIngester()
98+
with patch.object(ingester.delayed_requester, "get_response_json") as mock_get:
99+
ingester.get_response_json({})
100+
actual_headers = mock_get.call_args.kwargs["headers"]
101+
assert "User-Agent" in actual_headers
102+
assert actual_headers["User-Agent"] == prov.UA_STRING
103+
104+
95105
def test_batch_limit_is_capped_to_ingestion_limit():
96106
with patch(
97107
"providers.provider_api_scripts.provider_data_ingester.Variable"

0 commit comments

Comments
 (0)