diff --git a/catalog/dags/common/loader/provider_details.py b/catalog/dags/common/loader/provider_details.py index 3b81d8d2662..6e892a62e2a 100644 --- a/catalog/dags/common/loader/provider_details.py +++ b/catalog/dags/common/loader/provider_details.py @@ -124,7 +124,13 @@ # User-Agent header for APIs that require it CONTACT_EMAIL = os.getenv("CONTACT_EMAIL") -UA_STRING = f"Openverse/0.1 (https://wordpress.org/openverse; {CONTACT_EMAIL})" + +CANONICAL_DOMAIN: str = os.getenv("CANONICAL_DOMAIN", "openverse.org") + +_proto = "http" if "localhost" in CANONICAL_DOMAIN else "https" +CANONICAL_ORIGIN: str = f"{_proto}://{CANONICAL_DOMAIN}" + +UA_STRING = f"Openverse/0.1 ({CANONICAL_ORIGIN}; {CONTACT_EMAIL})" # Available Image Categories for API diff --git a/catalog/dags/common/requester.py b/catalog/dags/common/requester.py index c417de28738..1b91c3a7cae 100644 --- a/catalog/dags/common/requester.py +++ b/catalog/dags/common/requester.py @@ -7,6 +7,7 @@ from requests.exceptions import JSONDecodeError import oauth2 +from common.loader import provider_details as prov # pytest_socket will not be available in production, so we must create a shim for @@ -42,12 +43,13 @@ class DelayedRequester: delay: an integer giving the minimum number of seconds to wait between consecutive requests via the `get` method. headers: a dict that will be passed in all requests, unless overridden - by kwargs in specific calls to the get method + by kwargs in specific calls to the `get` method """ - def __init__(self, delay=0, headers=None): + def __init__(self, delay: int = 0, headers: dict | None = None): + headers = {} if headers is None else headers self._DELAY = delay - self.headers = headers or {} + self.headers = {"User-Agent": prov.UA_STRING} | headers self._last_request = 0 self.session = requests.Session() diff --git a/catalog/dags/providers/provider_api_scripts/museum_victoria.py b/catalog/dags/providers/provider_api_scripts/museum_victoria.py index aa6170cecb1..7198a1e8b12 100644 --- a/catalog/dags/providers/provider_api_scripts/museum_victoria.py +++ b/catalog/dags/providers/provider_api_scripts/museum_victoria.py @@ -25,7 +25,7 @@ class ImageDetails(TypedDict, total=False): class VictoriaDataIngester(ProviderDataIngester): providers = {"image": prov.VICTORIA_DEFAULT_PROVIDER} endpoint = "https://collections.museumsvictoria.com.au/api/search" - headers = {"User-Agent": prov.UA_STRING, "Accept": "application/json"} + headers = {"Accept": "application/json"} batch_limit = 100 delay = 5 LANDING_PAGE = "https://collections.museumsvictoria.com.au/" diff --git a/catalog/dags/providers/provider_api_scripts/nappy.py b/catalog/dags/providers/provider_api_scripts/nappy.py index 7f2a62b2eb8..f9078b8ba90 100644 --- a/catalog/dags/providers/provider_api_scripts/nappy.py +++ b/catalog/dags/providers/provider_api_scripts/nappy.py @@ -24,7 +24,7 @@ class NappyDataIngester(ProviderDataIngester): providers = {constants.IMAGE: prov.NAPPY_DEFAULT_PROVIDER} endpoint = "https://api.nappy.co/v1/openverse/images" - headers = {"User-Agent": prov.UA_STRING, "Accept": "application/json"} + headers = {"Accept": "application/json"} # Hardcoded to CC0, the only license Nappy.co uses license_info = get_license_info( diff --git a/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py b/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py index 5c6c0032a30..982dc4bc91a 100644 --- a/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py +++ b/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py @@ -8,6 +8,7 @@ from airflow.exceptions import AirflowException from airflow.models import Variable +from common.loader import provider_details as prov from common.requester import DelayedRequester from common.storage.media import MediaStore from common.storage.util import get_media_store_class @@ -145,6 +146,9 @@ def __init__( # Keep track of number of records ingested self.record_count = 0 + # Set default headers + self.headers = {"User-Agent": prov.UA_STRING} | self.headers + # Initialize the DelayedRequester and all necessary Media Stores. self.delayed_requester = DelayedRequester( delay=self.delay, headers=self.headers diff --git a/catalog/dags/providers/provider_api_scripts/rawpixel.py b/catalog/dags/providers/provider_api_scripts/rawpixel.py index c731547c70d..bbd8d657048 100644 --- a/catalog/dags/providers/provider_api_scripts/rawpixel.py +++ b/catalog/dags/providers/provider_api_scripts/rawpixel.py @@ -89,7 +89,6 @@ class RawpixelDataIngester(ProviderDataIngester): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.api_key: str = Variable.get("API_KEY_RAWPIXEL") - self.headers = {"User-Agent": prov.UA_STRING} def get_media_type(self, record: dict) -> str: return constants.IMAGE diff --git a/catalog/dags/providers/provider_api_scripts/stocksnap.py b/catalog/dags/providers/provider_api_scripts/stocksnap.py index e2ae6c77404..e7d2f558d20 100644 --- a/catalog/dags/providers/provider_api_scripts/stocksnap.py +++ b/catalog/dags/providers/provider_api_scripts/stocksnap.py @@ -31,10 +31,7 @@ class StockSnapDataIngester(ProviderDataIngester): providers = {"image": prov.STOCKSNAP_DEFAULT_PROVIDER} batch_limit = 1000 delay = 1 # in seconds - headers = { - "Accept": "application/json", - "User-Agent": prov.UA_STRING, - } + headers = {"Accept": "application/json"} license_url = "https://creativecommons.org/publicdomain/zero/1.0/" license_info = get_license_info(license_url=license_url) diff --git a/catalog/dags/providers/provider_api_scripts/wikimedia_commons.py b/catalog/dags/providers/provider_api_scripts/wikimedia_commons.py index 0b6cb659f3c..2500e8bd266 100644 --- a/catalog/dags/providers/provider_api_scripts/wikimedia_commons.py +++ b/catalog/dags/providers/provider_api_scripts/wikimedia_commons.py @@ -130,7 +130,6 @@ class WikimediaCommonsDataIngester(ProviderDataIngester): } host = "commons.wikimedia.org" endpoint = f"https://{host}/w/api.php" - headers = {"User-Agent": prov.UA_STRING} # The 10000 is a bit arbitrary, but needs to be larger than the mean # number of uses per file (globally) in the response_json, or we will diff --git a/catalog/tests/dags/common/test_requester.py b/catalog/tests/dags/common/test_requester.py index 2f43658234c..6ca32baae1f 100644 --- a/catalog/tests/dags/common/test_requester.py +++ b/catalog/tests/dags/common/test_requester.py @@ -8,6 +8,10 @@ from catalog.tests.dags.conftest import FAKE_OAUTH_PROVIDER_NAME from common import requester +from common.loader import provider_details as prov + + +USER_AGENT = {"User-Agent": prov.UA_STRING} @patch("common.requester.time") @@ -155,8 +159,12 @@ def test_oauth_requester_initializes_correctly(oauth_provider_var_mock): @pytest.mark.parametrize( "init_headers, request_kwargs, expected_request_kwargs", [ - (None, None, {"headers": {}}), - ({"init_header": "test"}, None, {"headers": {"init_header": "test"}}), + (None, None, {"headers": USER_AGENT}), + ( + {"init_header": "test"}, + None, + {"headers": {"init_header": "test"} | USER_AGENT}, + ), ( None, {"headers": {"h1": "test1"}, "other_kwarg": "test"}, diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py b/catalog/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py index 070726c4230..67844865fe6 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py @@ -17,6 +17,7 @@ MockImageOnlyProviderDataIngester, MockProviderDataIngester, ) +from common.loader import provider_details as prov from common.requester import RetriesExceeded from common.storage.audio import AudioStore, MockAudioStore from common.storage.image import ImageStore, MockImageStore @@ -92,6 +93,15 @@ def test_get_response_json(endpoint, expected): assert actual_endpoint == expected +def test_passes_user_agent_header(): + ingester = MockProviderDataIngester() + with patch.object(ingester.delayed_requester, "get_response_json") as mock_get: + ingester.get_response_json({}) + actual_headers = mock_get.call_args.kwargs["headers"] + assert "User-Agent" in actual_headers + assert actual_headers["User-Agent"] == prov.UA_STRING + + def test_batch_limit_is_capped_to_ingestion_limit(): with patch( "providers.provider_api_scripts.provider_data_ingester.Variable"