diff --git a/backend/common/providers/crossref_provider.py b/backend/common/providers/crossref_provider.py
index 421f221af5130..8fdea7ebdbafa 100644
--- a/backend/common/providers/crossref_provider.py
+++ b/backend/common/providers/crossref_provider.py
@@ -1,3 +1,4 @@
+import html
 import logging
 from datetime import datetime
 from urllib.parse import urlparse
@@ -101,14 +102,16 @@ def fetch_metadata(self, doi: str) -> dict:
             # Journal
             try:
                 if "short-container-title" in message and message["short-container-title"]:
-                    journal = message["short-container-title"][0]
+                    raw_journal = message["short-container-title"][0]
                 elif "container-title" in message and message["container-title"]:
-                    journal = message["container-title"][0]
+                    raw_journal = message["container-title"][0]
                 elif "institution" in message:
-                    journal = message["institution"][0]["name"]
+                    raw_journal = message["institution"][0]["name"]
             except Exception:
                 raise CrossrefParseException("Journal node missing") from None
 
+            journal = html.unescape(raw_journal)
+
             # Authors
             # Note: make sure that the order is preserved, as it is a relevant information
             authors = message["author"]
@@ -138,7 +141,6 @@ def fetch_metadata(self, doi: str) -> dict:
             raise CrossrefParseException("Cannot parse metadata from Crossref") from e
 
     def fetch_preprint_published_doi(self, doi):
-
         res = self._fetch_crossref_payload(doi)
         message = res.json()["message"]
         is_preprint = message.get("subtype") == "preprint"
diff --git a/backend/layers/thirdparty/crossref_provider.py b/backend/layers/thirdparty/crossref_provider.py
index 8a7a80d7946a1..3f97ce70cbf1b 100644
--- a/backend/layers/thirdparty/crossref_provider.py
+++ b/backend/layers/thirdparty/crossref_provider.py
@@ -1,3 +1,4 @@
+import html
 import logging
 from datetime import datetime
 from urllib.parse import urlparse
@@ -107,14 +108,16 @@ def fetch_metadata(self, doi: str) -> dict:
             # Journal
             try:
                 if "short-container-title" in message and message["short-container-title"]:
-                    journal = message["short-container-title"][0]
+                    raw_journal = message["short-container-title"][0]
                 elif "container-title" in message and message["container-title"]:
-                    journal = message["container-title"][0]
+                    raw_journal = message["container-title"][0]
                 elif "institution" in message:
-                    journal = message["institution"][0]["name"]
+                    raw_journal = message["institution"][0]["name"]
             except Exception:
                 raise CrossrefParseException("Journal node missing") from None
 
+            journal = html.unescape(raw_journal)
+
             # Authors
             # Note: make sure that the order is preserved, as it is a relevant information
             authors = message["author"]
diff --git a/tests/unit/backend/layers/thirdparty/test_crossref_provider.py b/tests/unit/backend/layers/thirdparty/test_crossref_provider.py
index 29644cb6caa54..4567001bac1b9 100644
--- a/tests/unit/backend/layers/thirdparty/test_crossref_provider.py
+++ b/tests/unit/backend/layers/thirdparty/test_crossref_provider.py
@@ -25,7 +25,6 @@ def test__provider_does_not_call_crossref_in_test(self, mock_get):
     @patch("backend.common.providers.crossref_provider.requests.get")
     @patch("backend.common.providers.crossref_provider.CorporaConfig")
     def test__provider_calls_crossref_if_api_key_defined(self, mock_config, mock_get):
-
         # Defining a mocked CorporaConfig will allow the provider to consider the `crossref_api_key`
         # not None, so it will go ahead and do the mocked call.
 
@@ -75,7 +74,6 @@ def test__provider_calls_crossref_if_api_key_defined(self, mock_config, mock_get
     @patch("backend.common.providers.crossref_provider.requests.get")
     @patch("backend.common.providers.crossref_provider.CorporaConfig")
     def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_get):
-
         response = Response()
         response.status_code = 200
         response._content = str.encode(
@@ -138,6 +136,45 @@ def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_ge
 
         self.assertDictEqual(expected_response, res)
 
+    @patch("backend.common.providers.crossref_provider.requests.get")
+    @patch("backend.common.providers.crossref_provider.CorporaConfig")
+    def test__provider_unescapes_journal_correctly(self, mock_config, mock_get):
+        response = Response()
+        response.status_code = 200
+        response._content = str.encode(
+            json.dumps(
+                {
+                    "status": "ok",
+                    "message": {
+                        "author": [
+                            {"name": "A consortium"},
+                        ],
+                        "published-online": {"date-parts": [[2021, 11]]},
+                        "container-title": ["Clinical &amp; Translational Med"],
+                    },
+                }
+            )
+        )
+
+        mock_get.return_value = response
+        provider = CrossrefProvider()
+        res = provider.fetch_metadata("test_doi")
+        mock_get.assert_called_once()
+
+        expected_response = {
+            "authors": [
+                {"name": "A consortium"},
+            ],
+            "published_year": 2021,
+            "published_month": 11,
+            "published_day": 1,
+            "published_at": 1635724800.0,
+            "journal": "Clinical & Translational Med",
+            "is_preprint": False,
+        }
+
+        self.assertDictEqual(expected_response, res)
+
     @patch("backend.common.providers.crossref_provider.requests.get")
     @patch("backend.common.providers.crossref_provider.CorporaConfig")
     def test__provider_throws_exception_if_request_fails(self, mock_config, mock_get):