Skip to content

Commit

Permalink
feat: fetch published doi if preprint (#6311)
Browse files Browse the repository at this point in the history
- enforce https://doi.org/ domain prefix
- use preprint doi if published doi cannot be retrieved
  • Loading branch information
Daniel Hegeman authored Jan 6, 2024
1 parent 58d4708 commit 59c3b49
Show file tree
Hide file tree
Showing 9 changed files with 255 additions and 85 deletions.
37 changes: 24 additions & 13 deletions backend/layers/business/business.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
)
from backend.layers.common import validation
from backend.layers.common.cleanup import sanitize
from backend.layers.common.doi import doi_curie_from_link
from backend.layers.common.entities import (
CanonicalCollection,
CollectionId,
Expand Down Expand Up @@ -162,7 +163,7 @@ def trigger_dataset_artifact_update(
current_dataset_version_id, new_dataset_version_id, metadata_update
)

def _get_publisher_metadata(self, doi: str, errors: list) -> Optional[dict]:
def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict], Optional[str]]:
"""
Retrieves publisher metadata from Crossref.
"""
Expand All @@ -172,7 +173,7 @@ def _get_publisher_metadata(self, doi: str, errors: list) -> Optional[dict]:
errors.append({"link_type": CollectionLinkType.DOI, "reason": "DOI cannot be found on Crossref"})
except CrossrefException as e:
logging.warning(f"CrossrefException on create_collection: {e}. Will ignore metadata.")
return None
return None, None

def create_collection(
self, owner: str, curator_name: str, collection_metadata: CollectionMetadata
Expand All @@ -189,9 +190,14 @@ def create_collection(
validation.verify_collection_metadata(collection_metadata, errors)

# TODO: Maybe switch link.type to be an enum
doi = next((link.uri for link in collection_metadata.links if link.type == "DOI"), None)
doi_link = next((link for link in collection_metadata.links if link.type == "DOI"), None)

publisher_metadata = self._get_publisher_metadata(doi, errors) if doi is not None else None
publisher_metadata = None
if doi_link:
publisher_metadata, doi_curie_from_crossref = self._get_publisher_metadata(doi_link.uri, errors)
# Ensure DOI link has correct hyperlink formation a la https://doi.org/{curie_identifier}
# DOI returned from Crossref may be a different (published) DOI altogether if submitted DOI is preprint
doi_link.uri = f"https://doi.org/{doi_curie_from_crossref}"

if errors:
raise CollectionCreationException(errors)
Expand Down Expand Up @@ -357,15 +363,18 @@ def update_collection_version(

new_doi = None
current_doi = None
if apply_doi_update:
if apply_doi_update and body.links is not None: # empty list is used to reset DOI
# Determine if the DOI has changed
current_doi = next(
(link.uri for link in current_collection_version.metadata.links if link.type == "DOI"), None
)
new_doi = (
None if body.links is None else next((link.uri for link in body.links if link.type == "DOI"), None)
)

# Format new doi link correctly; current link will have format https://doi.org/{curie_identifier}
if new_doi_curie := doi_curie_from_link(
next((link.uri for link in body.links if link.type == "DOI"), None)
):
new_doi = f"https://doi.org/{new_doi_curie}"
else:
next((link.uri for link in body.links if link.type == "DOI"), None)
if current_doi != new_doi:
for dataset in current_collection_version.datasets:
# Avoid reprocessing a dataset while it is already processing to avoid race conditions
Expand All @@ -382,13 +391,16 @@ def update_collection_version(
}
)
break

elif doi_link := next((link for link in body.links if link.type == "DOI"), None):
doi_link.uri = new_doi # Ensures we submit DOI link in correct format
if current_doi and new_doi is None:
# If the DOI was deleted, remove the publisher_metadata field
unset_publisher_metadata = True
elif (new_doi is not None) and new_doi != current_doi:
elif new_doi and new_doi != current_doi:
# If the DOI has changed, fetch and update the metadata
publisher_metadata_to_set = self._get_publisher_metadata(new_doi, errors)
publisher_metadata_to_set, doi_curie_from_crossref = self._get_publisher_metadata(new_doi, errors)
new_doi = f"https://doi.org/{doi_curie_from_crossref}"
next((link for link in body.links if link.type == "DOI")).uri = new_doi # noqa - DOI link exists

if errors:
raise CollectionUpdateException(errors)
Expand All @@ -409,7 +421,6 @@ def update_collection_version(
elif publisher_metadata_to_set is not None:
self.database_provider.save_collection_publisher_metadata(version_id, publisher_metadata_to_set)
self.database_provider.save_collection_metadata(version_id, new_metadata)

if all(
[apply_doi_update, new_doi != current_doi, FeatureFlagService.is_enabled(FeatureFlagValues.CITATION_UPDATE)]
):
Expand Down
3 changes: 3 additions & 0 deletions backend/layers/business/business_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def get_collection_version_from_canonical(
) -> Optional[CollectionVersionWithDatasets]:
pass

def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict], Optional[str]]:
pass

def create_collection(
self, owner: str, curator_name: str, collection_metadata: CollectionMetadata
) -> CollectionVersion:
Expand Down
10 changes: 9 additions & 1 deletion backend/layers/common/doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ def curation_get_normalized_doi_url(doi: str, errors: list) -> Optional[str]:
return f"https://doi.org/{doi}"


def doi_curie_from_link(doi: Optional[str]) -> Optional[str]:
    """
    Extract the bare DOI curie identifier (e.g. "10.1016/j.cell") from a DOI link.

    Strips the scheme and domain — typically https://doi.org/ — when *doi* is a
    full URL; a bare curie identifier is returned unchanged.

    :param doi: a DOI link or curie identifier, or None.
    :return: the curie identifier, or None when *doi* is None. Returning None
        (falsy) lets callers pass the result of a failed link lookup
        (``next(..., None)``) directly without a pre-check.
    """
    if doi is None:
        return None
    # Remove the https://doi.org/ (or other) domain part
    parsed = urlparse(doi)
    if parsed.scheme and parsed.netloc:
        doi = parsed.path.lstrip("/")
    return doi


def portal_get_normalized_doi_url(doi_node: dict, errors: list) -> Optional[str]:
"""
1. Check for DOI uniqueness in the payload
Expand All @@ -37,7 +45,7 @@ def portal_get_normalized_doi_url(doi_node: dict, errors: list) -> Optional[str]
doi_url = doi_node["link_url"]
parsed = urlparse(doi_url)
if not parsed.scheme and not parsed.netloc:
parsed_doi = parsed.path
parsed_doi = parsed.path.lstrip("/")
if not DOI_REGEX_COMPILED.match(parsed_doi):
errors.append({"link_type": "DOI", "reason": "Invalid DOI"})
return None
Expand Down
72 changes: 36 additions & 36 deletions backend/layers/thirdparty/crossref_provider.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import html
import logging
from datetime import datetime
from urllib.parse import urlparse
from typing import Optional, Tuple

import requests

from backend.common.corpora_config import CorporaConfig
from backend.layers.common.doi import doi_curie_from_link


class CrossrefProviderInterface:
def fetch_metadata(self, doi: str) -> dict:
pass
def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]:
    """Interface stub: return an empty (publisher metadata, DOI curie) pair."""
    return (None, None)

def fetch_preprint_published_doi(self, doi):
    """Interface stub: concrete providers resolve a preprint DOI to its published DOI."""
Expand Down Expand Up @@ -54,10 +55,6 @@ def parse_date_parts(obj):
return (year, month, day)

def _fetch_crossref_payload(self, doi):
# Remove the https://doi.org part
parsed = urlparse(doi)
if parsed.scheme and parsed.netloc:
doi = parsed.path

if self.crossref_api_key is None:
logging.info("No Crossref API key found, skipping metadata fetching.")
Expand All @@ -77,16 +74,19 @@ def _fetch_crossref_payload(self, doi):

return res

def fetch_metadata(self, doi: str) -> dict:
def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]:
"""
Fetches and extracts publisher metadata from Crossref for a specified DOI.
If the Crossref API URI isn't in the configuration, we will just return an empty object.
This is to avoid calling Crossref in non-production environments.
:param doi: str - DOI uri link or curie identifier
return: tuple - publisher metadata dict and DOI curie identifier
"""
doi_curie = doi_curie_from_link(doi)

res = self._fetch_crossref_payload(doi)
res = self._fetch_crossref_payload(doi_curie)
if not res:
return
return None, None

try:
message = res.json()["message"]
Expand Down Expand Up @@ -135,32 +135,32 @@ def fetch_metadata(self, doi: str) -> dict:

# Preprint
is_preprint = message.get("subtype") == "preprint"

return {
"authors": parsed_authors,
"published_year": published_year,
"published_month": published_month,
"published_day": published_day,
"published_at": datetime.timestamp(datetime(published_year, published_month, published_day)),
"journal": journal,
"is_preprint": is_preprint,
}
if is_preprint:
published_metadata, published_doi_curie = self.fetch_published_metadata(message)
if published_metadata and published_doi_curie: # if not, use preprint doi curie
return published_metadata, published_doi_curie

return (
{
"authors": parsed_authors,
"published_year": published_year,
"published_month": published_month,
"published_day": published_day,
"published_at": datetime.timestamp(datetime(published_year, published_month, published_day)),
"journal": journal,
"is_preprint": is_preprint,
},
doi_curie,
)
except Exception as e:
raise CrossrefParseException("Cannot parse metadata from Crossref") from e

def fetch_preprint_published_doi(self, doi):
"""
Given a preprint DOI, returns the DOI of the published paper, if available.
"""

res = self._fetch_crossref_payload(doi)
message = res.json()["message"]
is_preprint = message.get("subtype") == "preprint"

if is_preprint:
try:
published_doi = message["relation"]["is-preprint-of"]
if published_doi[0]["id-type"] == "doi":
return published_doi[0]["id"]
except Exception:
pass
def fetch_published_metadata(self, doi_response_message: dict) -> Tuple[Optional[dict], Optional[str]]:
    """
    Given the Crossref response message for a preprint DOI, fetch metadata for
    the published version of the paper, if one is linked.

    :param doi_response_message: the "message" payload from a Crossref DOI lookup.
    :return: (publisher metadata dict, published DOI curie), or (None, None) when
        no published DOI is linked or the lookup fails — callers then fall back
        to the preprint DOI.
    """
    try:
        related = doi_response_message["relation"]["is-preprint-of"]
        # the new DOI to query for ...
        for entity in related:
            if entity["id-type"] == "doi":
                return self.fetch_metadata(entity["id"])
    except Exception:  # if fetch of published doi errors out, just use preprint doi
        pass
    # BUG FIX: previously the function fell off the end (returning bare None)
    # when the relation existed but contained no "doi"-typed entry; callers
    # unpack two values, so always return a (metadata, curie) pair.
    return None, None
29 changes: 26 additions & 3 deletions tests/unit/backend/layers/api/test_curation_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1297,10 +1297,12 @@ def test__update_collection__doi__OK(self):
{"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"},
]
new_doi = "10.1016" # a real DOI (CURIE reference)
self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020"))
collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id
original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json
self.assertEqual(initial_doi, original_collection["doi"])
metadata = {"doi": new_doi}
self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016"))
response = self.app.patch(
f"/curation/v1/collections/{collection_id}",
json=metadata,
Expand All @@ -1315,6 +1317,7 @@ def test__update_collection__consortia__OK(self):
links = [
{"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"},
]
self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020"))
collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id
original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json
self.assertEqual(initial_consortia, original_collection["consortia"])
Expand All @@ -1333,6 +1336,7 @@ def test__remove_collection__consortia__OK(self):
links = [
{"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"},
]
self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020"))
collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id
original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json
self.assertEqual(initial_consortia, original_collection["consortia"])
Expand All @@ -1351,6 +1355,7 @@ def test__update_public_collection_verify_fix_consortia_sort_order_OK(self):
links = [
{"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"},
]
self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020"))
collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id
original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json
self.assertEqual(initial_consortia, original_collection["consortia"])
Expand All @@ -1367,6 +1372,9 @@ def test__update_collection__doi_is_not_CURIE_reference__BAD_REQUEST(self):
links = [
{"link_name": "doi", "link_type": "DOI", "link_url": "http://doi.doi/10.1011/something"},
]
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "10.1011/something")
)
collection = self.generate_collection(links=links, visibility="PRIVATE")
collection_id = collection.collection_id
original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json
Expand All @@ -1385,9 +1393,8 @@ def test__update_collection__links_None_does_not_remove_publisher_metadata(self)
links = [
{"link_name": "doi", "link_type": "DOI", "link_url": "http://doi.doi/10.1011/something"},
]

mock_publisher_metadata = generate_mock_publisher_metadata()
self.crossref_provider.fetch_metadata = Mock(return_value=mock_publisher_metadata)
self.crossref_provider.fetch_metadata = Mock(return_value=(mock_publisher_metadata, "10.1011/something"))

collection = self.generate_collection(links=links, visibility="PRIVATE")
collection_id = collection.collection_id
Expand All @@ -1414,7 +1421,8 @@ def test__update_collection__doi_does_not_exist__BAD_REQUEST(self):
{"link_name": "new link", "link_type": "RAW_DATA", "link_url": "http://brand_new_link.place"},
]
mock_publisher_metadata = generate_mock_publisher_metadata()
self.crossref_provider.fetch_metadata = Mock(return_value=mock_publisher_metadata)
self.crossref_provider.fetch_metadata = Mock(return_value=(mock_publisher_metadata, "10.1011/something"))

collection = self.generate_collection(links=links, visibility="PRIVATE")
self.assertIsNotNone(collection.publisher_metadata)
collection_id = collection.collection_id
Expand Down Expand Up @@ -1780,6 +1788,9 @@ def test_get_dataset_no_assets(self):
self.assertEqual([], body["assets"])

def test_get_all_datasets_200(self):
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "12.3456/j.celrep")
)
published_collection_1 = self.generate_published_collection(
add_datasets=2,
metadata=CollectionMetadata(
Expand All @@ -1791,6 +1802,9 @@ def test_get_all_datasets_200(self):
["Consortia 1", "Consortia 2"],
),
)
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep")
)
published_collection_2 = self.generate_published_collection(
owner="other owner",
curator_name="other curator",
Expand Down Expand Up @@ -1880,6 +1894,9 @@ def test_get_all_datasets_200(self):
self.assertEqual(expected_assets, dataset["assets"])

def test_get_datasets_by_schema_200(self):
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "12.3456/j.celrep")
)
published_collection_1 = self.generate_published_collection(
add_datasets=2,
metadata=CollectionMetadata(
Expand All @@ -1891,6 +1908,9 @@ def test_get_datasets_by_schema_200(self):
["Consortia 1", "Consortia 2"],
),
)
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep")
)
published_collection_2 = self.generate_published_collection(
owner="other owner",
curator_name="other curator",
Expand All @@ -1905,6 +1925,9 @@ def test_get_datasets_by_schema_200(self):
),
dataset_schema_version="3.1.0",
)
self.crossref_provider.fetch_metadata = Mock(
return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep")
)
published_collection_3 = self.generate_published_collection(
owner="other owner",
curator_name="other curator",
Expand Down
Loading

0 comments on commit 59c3b49

Please sign in to comment.