Skip to content

Commit

Permalink
feat: fetch published doi if preprint (#6311)
Browse files Browse the repository at this point in the history
- enforce https://doi.org/ domain prefix
- use preprint doi if published doi cannot be retrieved
  • Loading branch information
Daniel Hegeman authored Jan 6, 2024
1 parent 58d4708 commit 59c3b49
Show file tree
Hide file tree
Showing 9 changed files with 255 additions and 85 deletions.
37 changes: 24 additions & 13 deletions backend/layers/business/business.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
)
from backend.layers.common import validation
from backend.layers.common.cleanup import sanitize
from backend.layers.common.doi import doi_curie_from_link
from backend.layers.common.entities import (
CanonicalCollection,
CollectionId,
Expand Down Expand Up @@ -162,7 +163,7 @@ def trigger_dataset_artifact_update(
current_dataset_version_id, new_dataset_version_id, metadata_update
)

def _get_publisher_metadata(self, doi: str, errors: list) -> Optional[dict]:
def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict], Optional[str]]:
"""
Retrieves publisher metadata from Crossref.
"""
Expand All @@ -172,7 +173,7 @@ def _get_publisher_metadata(self, doi: str, errors: list) -> Optional[dict]:
errors.append({"link_type": CollectionLinkType.DOI, "reason": "DOI cannot be found on Crossref"})
except CrossrefException as e:
logging.warning(f"CrossrefException on create_collection: {e}. Will ignore metadata.")
return None
return None, None

def create_collection(
self, owner: str, curator_name: str, collection_metadata: CollectionMetadata
Expand All @@ -189,9 +190,14 @@ def create_collection(
validation.verify_collection_metadata(collection_metadata, errors)

# TODO: Maybe switch link.type to be an enum
doi = next((link.uri for link in collection_metadata.links if link.type == "DOI"), None)
doi_link = next((link for link in collection_metadata.links if link.type == "DOI"), None)

publisher_metadata = self._get_publisher_metadata(doi, errors) if doi is not None else None
publisher_metadata = None
if doi_link:
publisher_metadata, doi_curie_from_crossref = self._get_publisher_metadata(doi_link.uri, errors)
# Ensure DOI link has correct hyperlink formation a la https://doi.org/{curie_identifier}
# DOI returned from Crossref may be a different (published) DOI altogether if submitted DOI is preprint
doi_link.uri = f"https://doi.org/{doi_curie_from_crossref}"

if errors:
raise CollectionCreationException(errors)
Expand Down Expand Up @@ -357,15 +363,18 @@ def update_collection_version(

new_doi = None
current_doi = None
if apply_doi_update:
if apply_doi_update and body.links is not None: # empty list is used to reset DOI
# Determine if the DOI has changed
current_doi = next(
(link.uri for link in current_collection_version.metadata.links if link.type == "DOI"), None
)
new_doi = (
None if body.links is None else next((link.uri for link in body.links if link.type == "DOI"), None)
)

# Format new doi link correctly; current link will have format https://doi.org/{curie_identifier}
if new_doi_curie := doi_curie_from_link(
next((link.uri for link in body.links if link.type == "DOI"), None)
):
new_doi = f"https://doi.org/{new_doi_curie}"
else:
next((link.uri for link in body.links if link.type == "DOI"), None)
if current_doi != new_doi:
for dataset in current_collection_version.datasets:
# Avoid reprocessing a dataset while it is already processing to avoid race conditions
Expand All @@ -382,13 +391,16 @@ def update_collection_version(
}
)
break

elif doi_link := next((link for link in body.links if link.type == "DOI"), None):
doi_link.uri = new_doi # Ensures we submit DOI link in correct format
if current_doi and new_doi is None:
# If the DOI was deleted, remove the publisher_metadata field
unset_publisher_metadata = True
elif (new_doi is not None) and new_doi != current_doi:
elif new_doi and new_doi != current_doi:
# If the DOI has changed, fetch and update the metadata
publisher_metadata_to_set = self._get_publisher_metadata(new_doi, errors)
publisher_metadata_to_set, doi_curie_from_crossref = self._get_publisher_metadata(new_doi, errors)
new_doi = f"https://doi.org/{doi_curie_from_crossref}"
next((link for link in body.links if link.type == "DOI")).uri = new_doi # noqa - DOI link exists

if errors:
raise CollectionUpdateException(errors)
Expand All @@ -409,7 +421,6 @@ def update_collection_version(
elif publisher_metadata_to_set is not None:
self.database_provider.save_collection_publisher_metadata(version_id, publisher_metadata_to_set)
self.database_provider.save_collection_metadata(version_id, new_metadata)

if all(
[apply_doi_update, new_doi != current_doi, FeatureFlagService.is_enabled(FeatureFlagValues.CITATION_UPDATE)]
):
Expand Down
3 changes: 3 additions & 0 deletions backend/layers/business/business_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def get_collection_version_from_canonical(
) -> Optional[CollectionVersionWithDatasets]:
pass

def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict], Optional[str]]:
pass

def create_collection(
self, owner: str, curator_name: str, collection_metadata: CollectionMetadata
) -> CollectionVersion:
Expand Down
10 changes: 9 additions & 1 deletion backend/layers/common/doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ def curation_get_normalized_doi_url(doi: str, errors: list) -> Optional[str]:
return f"https://doi.org/{doi}"


def doi_curie_from_link(doi: Optional[str]) -> Optional[str]:
    """
    Extract the bare DOI curie identifier (e.g. "10.1016/j.cell") from a DOI link.

    Strips the scheme and domain — typically https://doi.org/ — when *doi* is a
    full URL; a bare curie identifier is returned unchanged.

    :param doi: a DOI link or curie identifier, or None.
    :return: the curie identifier, or None when *doi* is None. Returning None
        (falsy) lets callers pass the result of a failed link lookup
        (``next(..., None)``) directly without a pre-check.
    """
    if doi is None:
        return None
    # Remove the https://doi.org/ (or other) domain part
    parsed = urlparse(doi)
    if parsed.scheme and parsed.netloc:
        doi = parsed.path.lstrip("/")
    return doi


def portal_get_normalized_doi_url(doi_node: dict, errors: list) -> Optional[str]:
"""
1. Check for DOI uniqueness in the payload
Expand All @@ -37,7 +45,7 @@ def portal_get_normalized_doi_url(doi_node: dict, errors: list) -> Optional[str]
doi_url = doi_node["link_url"]
parsed = urlparse(doi_url)
if not parsed.scheme and not parsed.netloc:
parsed_doi = parsed.path
parsed_doi = parsed.path.lstrip("/")
if not DOI_REGEX_COMPILED.match(parsed_doi):
errors.append({"link_type": "DOI", "reason": "Invalid DOI"})
return None
Expand Down
72 changes: 36 additions & 36 deletions backend/layers/thirdparty/crossref_provider.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import html
import logging
from datetime import datetime
from urllib.parse import urlparse
from typing import Optional, Tuple

import requests

from backend.common.corpora_config import CorporaConfig
from backend.layers.common.doi import doi_curie_from_link


class CrossrefProviderInterface:
def fetch_metadata(self, doi: str) -> dict:
pass
def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]:
    """Interface stub: return an empty (publisher metadata, DOI curie) pair."""
    return (None, None)

def fetch_preprint_published_doi(self, doi):
    """Interface stub: concrete providers resolve a preprint DOI to its published DOI."""
Expand Down Expand Up @@ -54,10 +55,6 @@ def parse_date_parts(obj):
return (year, month, day)

def _fetch_crossref_payload(self, doi):
# Remove the https://doi.org part
parsed = urlparse(doi)
if parsed.scheme and parsed.netloc:
doi = parsed.path

if self.crossref_api_key is None:
logging.info("No Crossref API key found, skipping metadata fetching.")
Expand All @@ -77,16 +74,19 @@ def _fetch_crossref_payload(self, doi):

return res

def fetch_metadata(self, doi: str) -> dict:
def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]:
"""
Fetches and extracts publisher metadata from Crossref for a specified DOI.
If the Crossref API URI isn't in the configuration, we will just return an empty object.
This is to avoid calling Crossref in non-production environments.
:param doi: str - DOI uri link or curie identifier
return: tuple - publisher metadata dict and DOI curie identifier
"""
doi_curie = doi_curie_from_link(doi)

res = self._fetch_crossref_payload(doi)
res = self._fetch_crossref_payload(doi_curie)
if not res:
return
return None, None

try:
message = res.json()["message"]
Expand Down Expand Up @@ -135,32 +135,32 @@ def fetch_metadata(self, doi: str) -> dict:

# Preprint
is_preprint = message.get("subtype") == "preprint"

return {
"authors": parsed_authors,
"published_year": published_year,
"published_month": published_month,
"published_day": published_day,
"published_at": datetime.timestamp(datetime(published_year, published_month, published_day)),
"journal": journal,
"is_preprint": is_preprint,
}
if is_preprint:
published_metadata, published_doi_curie = self.fetch_published_metadata(message)
if published_metadata and published_doi_curie: # if not, use preprint doi curie
return published_metadata, published_doi_curie

return (
{
"authors": parsed_authors,
"published_year": published_year,
"published_month": published_month,
"published_day": published_day,
"published_at": datetime.timestamp(datetime(published_year, published_month, published_day)),
"journal": journal,
"is_preprint": is_preprint,
},
doi_curie,
)
except Exception as e:
raise CrossrefParseException("Cannot parse metadata from Crossref") from e

def fetch_preprint_published_doi(self, doi):
"""
Given a preprint DOI, returns the DOI of the published paper, if available.
"""

res = self._fetch_crossref_payload(doi)
message = res.json()["message"]
is_preprint = message.get("subtype") == "preprint"

if is_preprint:
try:
published_doi = message["relation"]["is-preprint-of"]
if published_doi[0]["id-type"] == "doi":
return published_doi[0]["id"]
except Exception:
pass
def fetch_published_metadata(self, doi_response_message: dict) -> Tuple[Optional[dict], Optional[str]]:
    """
    Given the Crossref response message for a preprint DOI, fetch metadata for
    the published version of the paper, if one is linked.

    :param doi_response_message: the "message" payload from a Crossref DOI lookup.
    :return: (publisher metadata dict, published DOI curie), or (None, None) when
        no published DOI is linked or the lookup fails — callers then fall back
        to the preprint DOI.
    """
    try:
        related = doi_response_message["relation"]["is-preprint-of"]
        # the new DOI to query for ...
        for entity in related:
            if entity["id-type"] == "doi":
                return self.fetch_metadata(entity["id"])
    except Exception:  # if fetch of published doi errors out, just use preprint doi
        pass
    # BUG FIX: previously the function fell off the end (returning bare None)
    # when the relation existed but contained no "doi"-typed entry; callers
    # unpack two values, so always return a (metadata, curie) pair.
    return None, None
29 changes: 26 additions & 3 deletions tests/unit/backend/layers/api/test_curation_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1297,10 +1297,12 @@ def test__update_collection__doi__OK(self):
{"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"},
]
new_doi = "10.1016" # a real DOI (CURIE reference)
self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020"))
collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id
original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json
self.assertEqual(initial_doi, original_collection["doi"])
metadata = {"doi": new_doi}
self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016"))
response = self.app.patch(
f"/curation/v1/collections/{collection_id}",
json=metadata,
Expand All @@ -1315,6 +1317,7 @@ def test__update_collection__consortia__OK(self):
links = [
{"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"},
]
self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020"))
collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id
original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json
self.assertEqual(initial_consortia, original_collection["consortia"])
Expand All @@ -1333,6 +1336,7 @@ def test__remove_collection__consortia__OK(self):
links = [
{"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"},
]
self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020"))
collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id
original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json
self.assertEqual(initial_consortia, original_collection["consortia"])
Expand All @@ -1351,6 +1355,7 @@ def test__update_public_collection_verify_fix_consortia_sort_order_OK(self):
links = [
{"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"},
]
self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020"))
collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id
original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json
self.assertEqual(initial_consortia, original_collection["consortia"])
Expand All @@ -1367,6 +1372,9 @@ def test__update_collection__doi_is_not_CURIE_reference__BAD_REQUEST(self):
links = [
{"link_name": "doi", "link_type": "DOI", "link_url": "http://doi.doi/10.1011/something"},
]
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "10.1011/something")
)
collection = self.generate_collection(links=links, visibility="PRIVATE")
collection_id = collection.collection_id
original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json
Expand All @@ -1385,9 +1393,8 @@ def test__update_collection__links_None_does_not_remove_publisher_metadata(self)
links = [
{"link_name": "doi", "link_type": "DOI", "link_url": "http://doi.doi/10.1011/something"},
]

mock_publisher_metadata = generate_mock_publisher_metadata()
self.crossref_provider.fetch_metadata = Mock(return_value=mock_publisher_metadata)
self.crossref_provider.fetch_metadata = Mock(return_value=(mock_publisher_metadata, "10.1011/something"))

collection = self.generate_collection(links=links, visibility="PRIVATE")
collection_id = collection.collection_id
Expand All @@ -1414,7 +1421,8 @@ def test__update_collection__doi_does_not_exist__BAD_REQUEST(self):
{"link_name": "new link", "link_type": "RAW_DATA", "link_url": "http://brand_new_link.place"},
]
mock_publisher_metadata = generate_mock_publisher_metadata()
self.crossref_provider.fetch_metadata = Mock(return_value=mock_publisher_metadata)
self.crossref_provider.fetch_metadata = Mock(return_value=(mock_publisher_metadata, "10.1011/something"))

collection = self.generate_collection(links=links, visibility="PRIVATE")
self.assertIsNotNone(collection.publisher_metadata)
collection_id = collection.collection_id
Expand Down Expand Up @@ -1780,6 +1788,9 @@ def test_get_dataset_no_assets(self):
self.assertEqual([], body["assets"])

def test_get_all_datasets_200(self):
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "12.3456/j.celrep")
)
published_collection_1 = self.generate_published_collection(
add_datasets=2,
metadata=CollectionMetadata(
Expand All @@ -1791,6 +1802,9 @@ def test_get_all_datasets_200(self):
["Consortia 1", "Consortia 2"],
),
)
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep")
)
published_collection_2 = self.generate_published_collection(
owner="other owner",
curator_name="other curator",
Expand Down Expand Up @@ -1880,6 +1894,9 @@ def test_get_all_datasets_200(self):
self.assertEqual(expected_assets, dataset["assets"])

def test_get_datasets_by_schema_200(self):
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "12.3456/j.celrep")
)
published_collection_1 = self.generate_published_collection(
add_datasets=2,
metadata=CollectionMetadata(
Expand All @@ -1891,6 +1908,9 @@ def test_get_datasets_by_schema_200(self):
["Consortia 1", "Consortia 2"],
),
)
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep")
)
published_collection_2 = self.generate_published_collection(
owner="other owner",
curator_name="other curator",
Expand All @@ -1905,6 +1925,9 @@ def test_get_datasets_by_schema_200(self):
),
dataset_schema_version="3.1.0",
)
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep")
)
published_collection_3 = self.generate_published_collection(
owner="other owner",
curator_name="other curator",
Expand Down
Loading

0 comments on commit 59c3b49

Please sign in to comment.