Skip to content

Commit

Permalink
fix: unescape CrossRef journal (#6373)
Browse files Browse the repository at this point in the history
  • Loading branch information
MillenniumFalconMechanic authored Dec 20, 2023
1 parent c7d21ed commit 6892f7b
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 9 deletions.
10 changes: 6 additions & 4 deletions backend/common/providers/crossref_provider.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import html
import logging
from datetime import datetime
from urllib.parse import urlparse
Expand Down Expand Up @@ -101,14 +102,16 @@ def fetch_metadata(self, doi: str) -> dict:
# Journal
try:
if "short-container-title" in message and message["short-container-title"]:
journal = message["short-container-title"][0]
raw_journal = message["short-container-title"][0]
elif "container-title" in message and message["container-title"]:
journal = message["container-title"][0]
raw_journal = message["container-title"][0]
elif "institution" in message:
journal = message["institution"][0]["name"]
raw_journal = message["institution"][0]["name"]
except Exception:
raise CrossrefParseException("Journal node missing") from None

journal = html.unescape(raw_journal)

# Authors
# Note: make sure that the order is preserved, as it is a relevant information
authors = message["author"]
Expand Down Expand Up @@ -138,7 +141,6 @@ def fetch_metadata(self, doi: str) -> dict:
raise CrossrefParseException("Cannot parse metadata from Crossref") from e

def fetch_preprint_published_doi(self, doi):

res = self._fetch_crossref_payload(doi)
message = res.json()["message"]
is_preprint = message.get("subtype") == "preprint"
Expand Down
9 changes: 6 additions & 3 deletions backend/layers/thirdparty/crossref_provider.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import html
import logging
from datetime import datetime
from urllib.parse import urlparse
Expand Down Expand Up @@ -107,14 +108,16 @@ def fetch_metadata(self, doi: str) -> dict:
# Journal
try:
if "short-container-title" in message and message["short-container-title"]:
journal = message["short-container-title"][0]
raw_journal = message["short-container-title"][0]
elif "container-title" in message and message["container-title"]:
journal = message["container-title"][0]
raw_journal = message["container-title"][0]
elif "institution" in message:
journal = message["institution"][0]["name"]
raw_journal = message["institution"][0]["name"]
except Exception:
raise CrossrefParseException("Journal node missing") from None

journal = html.unescape(raw_journal)

# Authors
# Note: make sure that the order is preserved, as it is a relevant information
authors = message["author"]
Expand Down
41 changes: 39 additions & 2 deletions tests/unit/backend/layers/thirdparty/test_crossref_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def test__provider_does_not_call_crossref_in_test(self, mock_get):
@patch("backend.common.providers.crossref_provider.requests.get")
@patch("backend.common.providers.crossref_provider.CorporaConfig")
def test__provider_calls_crossref_if_api_key_defined(self, mock_config, mock_get):

# Defining a mocked CorporaConfig will allow the provider to consider the `crossref_api_key`
# not None, so it will go ahead and do the mocked call.

Expand Down Expand Up @@ -75,7 +74,6 @@ def test__provider_calls_crossref_if_api_key_defined(self, mock_config, mock_get
@patch("backend.common.providers.crossref_provider.requests.get")
@patch("backend.common.providers.crossref_provider.CorporaConfig")
def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_get):

response = Response()
response.status_code = 200
response._content = str.encode(
Expand Down Expand Up @@ -138,6 +136,45 @@ def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_ge

self.assertDictEqual(expected_response, res)

@patch("backend.common.providers.crossref_provider.requests.get")
@patch("backend.common.providers.crossref_provider.CorporaConfig")
def test__provider_unescapes_journal_correctly(self, mock_config, mock_get):
    """Check that HTML entities in the Crossref journal title are decoded.

    The mocked Crossref payload carries an HTML-escaped ampersand (``&amp;``)
    in ``container-title``; the metadata returned by ``fetch_metadata`` is
    expected to contain the plain ``&`` character instead.

    ``mock_config`` is the patched ``CorporaConfig`` and ``mock_get`` is the
    patched ``requests.get`` (decorators are applied bottom-up).
    """
    response = Response()
    response.status_code = 200
    response._content = str.encode(
        json.dumps(
            {
                "status": "ok",
                "message": {
                    "author": [
                        {"name": "A consortium"},
                    ],
                    "published-online": {"date-parts": [[2021, 11]]},
                    # The title must contain an escaped entity, otherwise the
                    # input already equals the expected output and the test
                    # would pass even if no unescaping took place.
                    "container-title": ["Clinical &amp; Translational Med"],
                },
            }
        )
    )

    mock_get.return_value = response
    provider = CrossrefProvider()
    res = provider.fetch_metadata("test_doi")
    mock_get.assert_called_once()

    expected_response = {
        "authors": [
            {"name": "A consortium"},
        ],
        "published_year": 2021,
        "published_month": 11,
        "published_day": 1,
        "published_at": 1635724800.0,
        # Unescaped form of the mocked "container-title" entry.
        "journal": "Clinical & Translational Med",
        "is_preprint": False,
    }

    self.assertDictEqual(expected_response, res)

@patch("backend.common.providers.crossref_provider.requests.get")
@patch("backend.common.providers.crossref_provider.CorporaConfig")
def test__provider_throws_exception_if_request_fails(self, mock_config, mock_get):
Expand Down

0 comments on commit 6892f7b

Please sign in to comment.