From 59c3b492ca4edaf20735d88005efae17f5ce6d4d Mon Sep 17 00:00:00 2001 From: Daniel Hegeman Date: Fri, 5 Jan 2024 17:21:53 -0800 Subject: [PATCH] feat: fetch published doi if preprint (#6311) - enforce https://doi.org/ domain prefix - use preprint doi if published doi cannot be retrieved --- backend/layers/business/business.py | 37 ++++-- backend/layers/business/business_interface.py | 3 + backend/layers/common/doi.py | 10 +- .../layers/thirdparty/crossref_provider.py | 72 +++++------ .../backend/layers/api/test_curation_api.py | 29 ++++- .../backend/layers/api/test_portal_api.py | 43 +++++-- .../backend/layers/business/test_business.py | 31 ++--- .../thirdparty/test_crossref_provider.py | 112 +++++++++++++++++- .../unit/processing/test_process_validate.py | 3 +- 9 files changed, 255 insertions(+), 85 deletions(-) diff --git a/backend/layers/business/business.py b/backend/layers/business/business.py index 67df9181e38da..0ee28dd57f7ca 100644 --- a/backend/layers/business/business.py +++ b/backend/layers/business/business.py @@ -37,6 +37,7 @@ ) from backend.layers.common import validation from backend.layers.common.cleanup import sanitize +from backend.layers.common.doi import doi_curie_from_link from backend.layers.common.entities import ( CanonicalCollection, CollectionId, @@ -162,7 +163,7 @@ def trigger_dataset_artifact_update( current_dataset_version_id, new_dataset_version_id, metadata_update ) - def _get_publisher_metadata(self, doi: str, errors: list) -> Optional[dict]: + def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict], Optional[str]]: """ Retrieves publisher metadata from Crossref. """ @@ -172,7 +173,7 @@ def _get_publisher_metadata(self, doi: str, errors: list) -> Optional[dict]: errors.append({"link_type": CollectionLinkType.DOI, "reason": "DOI cannot be found on Crossref"}) except CrossrefException as e: logging.warning(f"CrossrefException on create_collection: {e}. Will ignore metadata.") - return None + return None, None def create_collection( self, owner: str, curator_name: str, collection_metadata: CollectionMetadata @@ -189,9 +190,14 @@ def create_collection( validation.verify_collection_metadata(collection_metadata, errors) # TODO: Maybe switch link.type to be an enum - doi = next((link.uri for link in collection_metadata.links if link.type == "DOI"), None) + doi_link = next((link for link in collection_metadata.links if link.type == "DOI"), None) - publisher_metadata = self._get_publisher_metadata(doi, errors) if doi is not None else None + publisher_metadata = None + if doi_link: + publisher_metadata, doi_curie_from_crossref = self._get_publisher_metadata(doi_link.uri, errors) + # Ensure DOI link has correct hyperlink formation a la https://doi.org/{curie_identifier} + # DOI returned from Crossref may be a different (published) DOI altogether if submitted DOI is preprint + doi_link.uri = f"https://doi.org/{doi_curie_from_crossref}" if errors: raise CollectionCreationException(errors) @@ -357,15 +363,18 @@ def update_collection_version( new_doi = None current_doi = None - if apply_doi_update: + if apply_doi_update and body.links is not None: # empty list is used to reset DOI # Determine if the DOI has changed current_doi = next( (link.uri for link in current_collection_version.metadata.links if link.type == "DOI"), None ) - new_doi = ( - None if body.links is None else next((link.uri for link in body.links if link.type == "DOI"), None) - ) - + # Format new doi link correctly; current link will have format https://doi.org/{curie_identifier} + if new_doi_curie := doi_curie_from_link( + next((link.uri for link in body.links if link.type == "DOI"), None) + ): + new_doi = f"https://doi.org/{new_doi_curie}" + else: + next((link.uri for link in body.links if link.type == "DOI"), None) if current_doi != new_doi: for dataset in current_collection_version.datasets: # Avoid reprocessing a dataset while it is already processing to avoid race conditions @@ -382,13 +391,16 @@ def update_collection_version( } ) break - + elif doi_link := next((link for link in body.links if link.type == "DOI"), None): + doi_link.uri = new_doi # Ensures we submit DOI link in correct format if current_doi and new_doi is None: # If the DOI was deleted, remove the publisher_metadata field unset_publisher_metadata = True - elif (new_doi is not None) and new_doi != current_doi: + elif new_doi and new_doi != current_doi: # If the DOI has changed, fetch and update the metadata - publisher_metadata_to_set = self._get_publisher_metadata(new_doi, errors) + publisher_metadata_to_set, doi_curie_from_crossref = self._get_publisher_metadata(new_doi, errors) + new_doi = f"https://doi.org/{doi_curie_from_crossref}" + next((link for link in body.links if link.type == "DOI")).uri = new_doi # noqa - DOI link exists if errors: raise CollectionUpdateException(errors) @@ -409,7 +421,6 @@ def update_collection_version( elif publisher_metadata_to_set is not None: self.database_provider.save_collection_publisher_metadata(version_id, publisher_metadata_to_set) self.database_provider.save_collection_metadata(version_id, new_metadata) - if all( [apply_doi_update, new_doi != current_doi, FeatureFlagService.is_enabled(FeatureFlagValues.CITATION_UPDATE)] ): diff --git a/backend/layers/business/business_interface.py b/backend/layers/business/business_interface.py index 67021853a0a5c..e286a7b1dbbd1 100644 --- a/backend/layers/business/business_interface.py +++ b/backend/layers/business/business_interface.py @@ -59,6 +59,9 @@ def get_collection_version_from_canonical( ) -> Optional[CollectionVersionWithDatasets]: pass + def _get_publisher_metadata(self, doi: str, errors: list) -> Tuple[Optional[dict], Optional[str]]: + pass + def create_collection( self, owner: str, curator_name: str, collection_metadata: CollectionMetadata ) -> CollectionVersion: diff --git a/backend/layers/common/doi.py b/backend/layers/common/doi.py index 72416e44d61bd..8cc0717606351 100644 --- a/backend/layers/common/doi.py +++ b/backend/layers/common/doi.py @@ -28,6 +28,14 @@ def curation_get_normalized_doi_url(doi: str, errors: list) -> Optional[str]: return f"https://doi.org/{doi}" +def doi_curie_from_link(doi: str) -> str: + # Remove the https://doi.org/ (or other) domain part + parsed = urlparse(doi) + if parsed.scheme and parsed.netloc: + doi = parsed.path.lstrip("/") + return doi + + def portal_get_normalized_doi_url(doi_node: dict, errors: list) -> Optional[str]: """ 1. Check for DOI uniqueness in the payload @@ -37,7 +45,7 @@ def portal_get_normalized_doi_url(doi_node: dict, errors: list) -> Optional[str] doi_url = doi_node["link_url"] parsed = urlparse(doi_url) if not parsed.scheme and not parsed.netloc: - parsed_doi = parsed.path + parsed_doi = parsed.path.lstrip("/") if not DOI_REGEX_COMPILED.match(parsed_doi): errors.append({"link_type": "DOI", "reason": "Invalid DOI"}) return None diff --git a/backend/layers/thirdparty/crossref_provider.py b/backend/layers/thirdparty/crossref_provider.py index 6401184a078c9..1aa84c45cb424 100644 --- a/backend/layers/thirdparty/crossref_provider.py +++ b/backend/layers/thirdparty/crossref_provider.py @@ -1,16 +1,17 @@ import html import logging from datetime import datetime -from urllib.parse import urlparse +from typing import Optional, Tuple import requests from backend.common.corpora_config import CorporaConfig +from backend.layers.common.doi import doi_curie_from_link class CrossrefProviderInterface: - def fetch_metadata(self, doi: str) -> dict: - pass + def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]: + return None, None def fetch_preprint_published_doi(self, doi): pass @@ -54,10 +55,6 @@ def parse_date_parts(obj): return (year, month, day) def _fetch_crossref_payload(self, doi): - # Remove the https://doi.org part - parsed = urlparse(doi) - if parsed.scheme and parsed.netloc: - doi = parsed.path if self.crossref_api_key is None: logging.info("No Crossref API key found, skipping metadata fetching.") @@ -77,16 +74,19 @@ def _fetch_crossref_payload(self, doi): return res - def fetch_metadata(self, doi: str) -> dict: + def fetch_metadata(self, doi: str) -> Tuple[Optional[dict], Optional[str]]: """ Fetches and extracts publisher metadata from Crossref for a specified DOI. If the Crossref API URI isn't in the configuration, we will just return an empty object. This is to avoid calling Crossref in non-production environments. + :param doi: str - DOI uri link or curie identifier + return: tuple - publisher metadata dict and DOI curie identifier """ + doi_curie = doi_curie_from_link(doi) - res = self._fetch_crossref_payload(doi) + res = self._fetch_crossref_payload(doi_curie) if not res: - return + return None, None try: message = res.json()["message"] @@ -135,32 +135,32 @@ def fetch_metadata(self, doi: str) -> dict: # Preprint is_preprint = message.get("subtype") == "preprint" - - return { - "authors": parsed_authors, - "published_year": published_year, - "published_month": published_month, - "published_day": published_day, - "published_at": datetime.timestamp(datetime(published_year, published_month, published_day)), - "journal": journal, - "is_preprint": is_preprint, - } + if is_preprint: + published_metadata, published_doi_curie = self.fetch_published_metadata(message) + if published_metadata and published_doi_curie: # if not, use preprint doi curie + return published_metadata, published_doi_curie + + return ( + { + "authors": parsed_authors, + "published_year": published_year, + "published_month": published_month, + "published_day": published_day, + "published_at": datetime.timestamp(datetime(published_year, published_month, published_day)), + "journal": journal, + "is_preprint": is_preprint, + }, + doi_curie, + ) except Exception as e: raise CrossrefParseException("Cannot parse metadata from Crossref") from e - def fetch_preprint_published_doi(self, doi): - """ - Given a preprint DOI, returns the DOI of the published paper, if available. - """ - - res = self._fetch_crossref_payload(doi) - message = res.json()["message"] - is_preprint = message.get("subtype") == "preprint" - - if is_preprint: - try: - published_doi = message["relation"]["is-preprint-of"] - if published_doi[0]["id-type"] == "doi": - return published_doi[0]["id"] - except Exception: - pass + def fetch_published_metadata(self, doi_response_message: dict) -> Tuple[Optional[dict], Optional[str]]: + try: + published_doi = doi_response_message["relation"]["is-preprint-of"] + # the new DOI to query for ... + for entity in published_doi: + if entity["id-type"] == "doi": + return self.fetch_metadata(entity["id"]) + except Exception: # if fetch of published doi errors out, just use preprint doi + return None, None diff --git a/tests/unit/backend/layers/api/test_curation_api.py b/tests/unit/backend/layers/api/test_curation_api.py index 25d55544ed6c2..f21eb80320fca 100644 --- a/tests/unit/backend/layers/api/test_curation_api.py +++ b/tests/unit/backend/layers/api/test_curation_api.py @@ -1297,10 +1297,12 @@ def test__update_collection__doi__OK(self): {"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"}, ] new_doi = "10.1016" # a real DOI (CURIE reference) + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020")) collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json self.assertEqual(initial_doi, original_collection["doi"]) metadata = {"doi": new_doi} + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) response = self.app.patch( f"/curation/v1/collections/{collection_id}", json=metadata, @@ -1315,6 +1317,7 @@ def test__update_collection__consortia__OK(self): links = [ {"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"}, ] + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020")) collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json self.assertEqual(initial_consortia, original_collection["consortia"]) @@ -1333,6 +1336,7 @@ def test__remove_collection__consortia__OK(self): links = [ {"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"}, ] + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020")) collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json self.assertEqual(initial_consortia, original_collection["consortia"]) @@ -1351,6 +1355,7 @@ def test__update_public_collection_verify_fix_consortia_sort_order_OK(self): links = [ {"link_name": "new doi", "link_type": "DOI", "link_url": "http://doi.org/10.2020"}, ] + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.2020")) collection_id = self.generate_collection(links=links, visibility="PRIVATE").collection_id original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json self.assertEqual(initial_consortia, original_collection["consortia"]) @@ -1367,6 +1372,9 @@ def test__update_collection__doi_is_not_CURIE_reference__BAD_REQUEST(self): links = [ {"link_name": "doi", "link_type": "DOI", "link_url": "http://doi.doi/10.1011/something"}, ] + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "10.1011/something") + ) collection = self.generate_collection(links=links, visibility="PRIVATE") collection_id = collection.collection_id original_collection = self.app.get(f"curation/v1/collections/{collection_id}").json @@ -1385,9 +1393,8 @@ def test__update_collection__links_None_does_not_remove_publisher_metadata(self) links = [ {"link_name": "doi", "link_type": "DOI", "link_url": "http://doi.doi/10.1011/something"}, ] - mock_publisher_metadata = generate_mock_publisher_metadata() - self.crossref_provider.fetch_metadata = Mock(return_value=mock_publisher_metadata) + self.crossref_provider.fetch_metadata = Mock(return_value=(mock_publisher_metadata, "10.1011/something")) collection = self.generate_collection(links=links, visibility="PRIVATE") collection_id = collection.collection_id @@ -1414,7 +1421,8 @@ def test__update_collection__doi_does_not_exist__BAD_REQUEST(self): {"link_name": "new link", "link_type": "RAW_DATA", "link_url": "http://brand_new_link.place"}, ] mock_publisher_metadata = generate_mock_publisher_metadata() - self.crossref_provider.fetch_metadata = Mock(return_value=mock_publisher_metadata) + self.crossref_provider.fetch_metadata = Mock(return_value=(mock_publisher_metadata, "10.1011/something")) + collection = self.generate_collection(links=links, visibility="PRIVATE") self.assertIsNotNone(collection.publisher_metadata) collection_id = collection.collection_id @@ -1780,6 +1788,9 @@ def test_get_dataset_no_assets(self): self.assertEqual([], body["assets"]) def test_get_all_datasets_200(self): + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "12.3456/j.celrep") + ) published_collection_1 = self.generate_published_collection( add_datasets=2, metadata=CollectionMetadata( @@ -1791,6 +1802,9 @@ def test_get_all_datasets_200(self): ["Consortia 1", "Consortia 2"], ), ) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep") + ) published_collection_2 = self.generate_published_collection( owner="other owner", curator_name="other curator", @@ -1880,6 +1894,9 @@ def test_get_all_datasets_200(self): self.assertEqual(expected_assets, dataset["assets"]) def test_get_datasets_by_schema_200(self): + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "12.3456/j.celrep") + ) published_collection_1 = self.generate_published_collection( add_datasets=2, metadata=CollectionMetadata( @@ -1891,6 +1908,9 @@ def test_get_datasets_by_schema_200(self): ["Consortia 1", "Consortia 2"], ), ) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep") + ) published_collection_2 = self.generate_published_collection( owner="other owner", curator_name="other curator", @@ -1905,6 +1925,9 @@ def test_get_datasets_by_schema_200(self): ), dataset_schema_version="3.1.0", ) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata(), "78.91011/j.celrep") + ) published_collection_3 = self.generate_published_collection( owner="other owner", curator_name="other curator", diff --git a/tests/unit/backend/layers/api/test_portal_api.py b/tests/unit/backend/layers/api/test_portal_api.py index b27b232582b4e..6ba4a350c7ac8 100644 --- a/tests/unit/backend/layers/api/test_portal_api.py +++ b/tests/unit/backend/layers/api/test_portal_api.py @@ -356,6 +356,7 @@ def test__post_collection_returns_id_on_success(self): "links": [{"link_name": "DOI Link", "link_url": "http://doi.org/10.1016", "link_type": "DOI"}], "consortia": ["Consortia 1"], } + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) json_data = json.dumps(data) response = self.app.post( test_url.url, @@ -414,6 +415,7 @@ def test__post_collection_sorts_consortia(self): ], "consortia": ["Consortia 3", "Consortia 1"], } + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016/foo")) json_data = json.dumps(data) response = self.app.post( test_url.url, @@ -439,6 +441,7 @@ def test__post_collection_normalizes_doi(self): ], "consortia": ["Consortia 1"], } + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016/foo")) json_data = json.dumps(data) response = self.app.post( test_url.url, @@ -586,6 +589,7 @@ def test__post_collection_adds_publisher_metadata(self): "links": [{"link_name": "DOI Link", "link_url": "http://doi.org/10.1016", "link_type": "DOI"}], "consortia": ["Consortia 1"], } + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) json_data = json.dumps(data) response = self.app.post( test_url.url, @@ -670,6 +674,7 @@ def test__can_retrieve_created_collection(self): {"link_name": "DOI Link 2", "link_url": "http://doi.org/10.1017", "link_type": "DOI"}, ], } + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1017")) response = self.app.post(test_url.url, headers=headers, data=json.dumps(data)) self.assertEqual(201, response.status_code) collection_id = json.loads(response.data)["collection_id"] @@ -720,6 +725,7 @@ def test__create_collection_strip_string_fields(self): ], "consortia": ["Consortia 1 "], } + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1017")) response = self.app.post(test_url.url, headers=headers, data=json.dumps(data)) self.assertEqual(201, response.status_code) collection_id = json.loads(response.data)["collection_id"] @@ -1170,6 +1176,7 @@ def test__update_collection__OK(self): "consortia", ] headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) # Update the collection expected_body = { @@ -1177,7 +1184,7 @@ def test__update_collection__OK(self): "description": "This is a test collection", "contact_name": "person human", "contact_email": "person@human.com", - "links": [{"link_name": "DOI Link", "link_url": "http://doi.org/10.1016", "link_type": "DOI"}], + "links": [{"link_name": "DOI Link", "link_url": "https://doi.org/10.1016", "link_type": "DOI"}], "consortia": ["Consortia 1"], } data = json.dumps(expected_body) @@ -1190,6 +1197,7 @@ def test__update_collection__OK(self): def test__update_collection_strip_string_fields__OK(self): collection = self.generate_unpublished_collection() headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) # Update the collection new_body = { @@ -1211,10 +1219,11 @@ def test__update_collection_strip_string_fields__OK(self): self.assertEqual(new_body["contact_email"].strip(), actual_body["contact_email"]) self.assertEqual(["Consortia 1"], actual_body["consortia"]) self.assertEqual( - [{"link_name": "DOI Link", "link_url": "http://doi.org/10.1016", "link_type": "DOI"}], actual_body["links"] + [{"link_name": "DOI Link", "link_url": "https://doi.org/10.1016", "link_type": "DOI"}], actual_body["links"] ) def test__update_collection_partial__OK(self): + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "123")) collection = self.generate_unpublished_collection(links=[Link("Link 1", "DOI", "http://doi.org/123")]) headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} @@ -1353,9 +1362,10 @@ def test__update_with_invalid_collection_consortia(self): def test__update_collection_links__OK(self): collection = self.generate_unpublished_collection() headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) # add links - links = [{"link_name": "DOI Link", "link_url": "http://doi.org/10.1016", "link_type": "DOI"}] + links = [{"link_name": "DOI Link", "link_url": "https://doi.org/10.1016", "link_type": "DOI"}] data = json.dumps({"links": links}) response = self.app.put(f"/dp/v1/collections/{collection.version_id.id}", data=data, headers=headers) self.assertEqual(200, response.status_code) @@ -1369,7 +1379,7 @@ def test__update_collection_links__OK(self): self.assertEqual(links, json.loads(response.data)["links"]) # update links - links = [{"link_name": "New name", "link_url": "http://doi.org/10.1016", "link_type": "DOI"}] + links = [{"link_name": "New name", "link_url": "https://doi.org/10.1016", "link_type": "DOI"}] data = json.dumps({"links": links}) response = self.app.put(f"/dp/v1/collections/{collection.version_id.id}", data=data, headers=headers) self.assertEqual(200, response.status_code) @@ -1378,7 +1388,7 @@ def test__update_collection_links__OK(self): # all together links = [ {"link_name": "Link 1", "link_url": "http://link.com", "link_type": "OTHER"}, - {"link_name": "DOI Link", "link_url": "http://doi.org/10.1016", "link_type": "DOI"}, + {"link_name": "DOI Link", "link_url": "https://doi.org/10.1016", "link_type": "DOI"}, ] data = json.dumps({"links": links}) response = self.app.put(f"/dp/v1/collections/{collection.version_id.id}", data=data, headers=headers) @@ -1396,7 +1406,9 @@ def test__update_collection_links__OK(self): # ✅ def test__update_collection_new_doi_updates_metadata(self): # Generate a collection with "Old Journal" as publisher metadata - self.crossref_provider.fetch_metadata = Mock(return_value=generate_mock_publisher_metadata("Old Journal")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata("Old Journal"), "123") + ) collection = self.generate_unpublished_collection(links=[Link("Link 1", "DOI", "http://doi.org/123")]) self.assertIsNotNone(collection.publisher_metadata) @@ -1407,6 +1419,9 @@ def test__update_collection_new_doi_updates_metadata(self): self.crossref_provider.fetch_metadata = Mock(return_value=generate_mock_publisher_metadata("New Journal")) headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata("New Journal"), "10.1234/5678") + ) response = self.app.put( f"/dp/v1/collections/{collection.version_id}", data=json.dumps({"links": [{"link_name": "Link 1", "link_url": "10.1234/5678", "link_type": "DOI"}]}), @@ -1424,15 +1439,20 @@ def test__update_collection_new_doi_updates_metadata(self): # ✅ def test__update_collection_remove_doi_deletes_metadata(self): # Generate a collection with "Old Journal" as publisher metadata - self.crossref_provider.fetch_metadata = Mock(return_value=generate_mock_publisher_metadata("Old Journal")) - collection = self.generate_unpublished_collection(links=[Link("Link 1", "DOI", "http://doi.org/123")]) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata("Old Journal"), "123") + ) + + collection = self.generate_unpublished_collection(links=[Link("Link 1", "DOI", "https://doi.org/123")]) self.assertIsNotNone(collection.publisher_metadata) if collection.publisher_metadata: # pylance self.assertEqual("Old Journal", collection.publisher_metadata["journal"]) # From now on, Crossref will return `New Journal` - self.crossref_provider.fetch_metadata = Mock(return_value=generate_mock_publisher_metadata("New Journal")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata("This will"), "not be called") + ) headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()} # We're passing an empty links object, therefore the DOI is deleted @@ -1452,7 +1472,9 @@ def test__update_collection_remove_doi_deletes_metadata(self): # ✅ def test__update_collection_same_doi_does_not_update_metadata(self): - self.crossref_provider.fetch_metadata = Mock(return_value=generate_mock_publisher_metadata("Old Journal")) + self.crossref_provider.fetch_metadata = Mock( + return_value=(generate_mock_publisher_metadata("Old Journal"), "123") + ) collection = self.generate_unpublished_collection(links=[Link("Link 1", "DOI", "http://doi.org/123")]) @@ -2323,6 +2345,7 @@ def test__start_revision_of_a_collection_w_links__201(self): Link("Link 1", "OTHER", "http://link.good"), Link("DOI Link", "DOI", "http://doi.org/10.1016"), ] + self.crossref_provider.fetch_metadata = Mock(return_value=(generate_mock_publisher_metadata(), "10.1016")) published_collection = self.generate_published_collection(links=links, add_datasets=2) # Starts a revision diff --git a/tests/unit/backend/layers/business/test_business.py b/tests/unit/backend/layers/business/test_business.py index 70b642fe8c100..b17917faebad0 100644 --- a/tests/unit/backend/layers/business/test_business.py +++ b/tests/unit/backend/layers/business/test_business.py @@ -361,21 +361,21 @@ def test_create_collection_with_valid_doi_ok(self): A collection can be created with a valid DOI, and the publisher metadata will be added to the collection """ - links_with_doi = [Link("test doi", "DOI", "http://good.doi")] + links_with_doi = [Link("test doi", "DOI", "https://doi.org/good/doi")] self.sample_collection_metadata.links = links_with_doi expected_publisher_metadata = {"authors": ["Test Author"]} - self.crossref_provider.fetch_metadata = Mock(return_value=expected_publisher_metadata) + self.crossref_provider.fetch_metadata = Mock(return_value=(expected_publisher_metadata, "good/doi")) collection = self.business_logic.create_collection( test_user_name, test_curator_name, self.sample_collection_metadata ) - self.crossref_provider.fetch_metadata.assert_called_with("http://good.doi") + self.crossref_provider.fetch_metadata.assert_called_with("https://doi.org/good/doi") collection_from_database = self.database_provider.get_collection_version(collection.version_id) self.assertEqual(1, len(collection_from_database.metadata.links)) - self.assertEqual(collection_from_database.metadata.links[0].uri, "http://good.doi") + self.assertEqual(collection_from_database.metadata.links[0].uri, "https://doi.org/good/doi") self.assertIsNotNone(collection_from_database.publisher_metadata) self.assertEqual(collection_from_database.publisher_metadata, expected_publisher_metadata) @@ -634,11 +634,11 @@ def test_update_collection_same_doi(self): A collection updated with the same DOI should not trigger a Crossref call """ metadata = self.sample_collection_metadata - links = [Link("test doi", "DOI", "http://test.doi")] + links = [Link("test doi", "DOI", "https://doi.org/test/doi")] metadata.links = links expected_publisher_metadata = {"authors": ["Test Author"]} - self.crossref_provider.fetch_metadata = Mock(return_value=expected_publisher_metadata) + self.crossref_provider.fetch_metadata = Mock(return_value=(expected_publisher_metadata, "test/doi")) # We need to call `business_logic.create_collection` so that the publisher metadata is populated version = self.business_logic.create_collection(test_user_name, test_curator_name, metadata) @@ -665,10 +665,10 @@ def test_update_collection_change_doi(self): A collection updated with a new DOI should get new publisher metadata from Crossref """ metadata = self.sample_collection_metadata - links = [Link("test doi", "DOI", "http://test.doi")] + links = [Link("test doi", "DOI", "http://doi.org/test.doi")] metadata.links = links - self.crossref_provider.fetch_metadata = Mock(return_value={"authors": ["Test Author"]}) + self.crossref_provider.fetch_metadata = Mock(return_value=({"authors": ["Test Author"]}, "test.doi")) # We need to call `business_logic.create_collection` so that the publisher metadata is populated version = self.business_logic.create_collection(test_user_name, test_curator_name, metadata) @@ -680,20 +680,19 @@ def test_update_collection_change_doi(self): description=None, contact_name=None, contact_email=None, - links=[Link("new test doi", "DOI", "http://new.test.doi")], + links=[Link("new test doi", "DOI", "http://doi.org/new.test.doi")], consortia=None, ) - expected_updated_publisher_metadata = {"authors": ["New Test Author"]} + expected_updated_publisher_metadata = ({"authors": ["New Test Author"]}, "new.test.doi") self.crossref_provider.fetch_metadata = Mock(return_value=expected_updated_publisher_metadata) self.batch_job_provider.start_metadata_update_batch_job = Mock() - self.business_logic.update_collection_version(version.version_id, body) self.crossref_provider.fetch_metadata.assert_called_once() self.batch_job_provider.start_metadata_update_batch_job.assert_not_called() # no datasets to update updated_version = self.database_provider.get_collection_version(version.version_id) - self.assertEqual(updated_version.publisher_metadata, expected_updated_publisher_metadata) + self.assertEqual(updated_version.publisher_metadata, expected_updated_publisher_metadata[0]) def test_update_collection_change_doi__trigger_dataset_artifact_updates(self): """ @@ -707,13 +706,14 @@ def test_update_collection_change_doi__trigger_dataset_artifact_updates(self): description=None, contact_name=None, contact_email=None, - links=[Link("new test doi", "DOI", "http://new.test.doi")], + links=[Link("new test doi", "DOI", "http://doi.org/new.test.doi")], consortia=None, ) self.batch_job_provider.start_metadata_update_batch_job = Mock() self.business_logic.generate_dataset_citation = Mock(return_value="test citation") - self.business_logic.update_collection_version(revision.version_id, body) + self.crossref_provider.fetch_metadata = Mock(return_value=({"authors": ["New Test Author"]}, "new.test.doi")) + self.business_logic.update_collection_version(revision.version_id, body) assert self.batch_job_provider.start_metadata_update_batch_job.call_count == 2 self.batch_job_provider.start_metadata_update_batch_job.assert_has_calls( [ @@ -737,9 +737,10 @@ def test_update_published_collection_fail__dataset_status_pending(self): description=None, contact_name=None, contact_email=None, - links=[Link("new test doi", "DOI", "http://new.test.doi")], + links=[Link("new test doi", "DOI", "http://doi.org/new.test.doi")], consortia=None, ) + self.crossref_provider.fetch_metadata = Mock(return_value=({"authors": ["New Test Author"]}, "new.test.doi")) with self.assertRaises(CollectionUpdateException): self.business_logic.update_collection_version(revision.version_id, body) diff --git a/tests/unit/backend/layers/thirdparty/test_crossref_provider.py b/tests/unit/backend/layers/thirdparty/test_crossref_provider.py index 355beee4f27c7..4ba668c999bc2 100644 --- a/tests/unit/backend/layers/thirdparty/test_crossref_provider.py +++ b/tests/unit/backend/layers/thirdparty/test_crossref_provider.py @@ -1,3 +1,4 @@ +import copy import json import unittest from unittest.mock import patch @@ -18,8 +19,8 @@ class TestCrossrefProvider(unittest.TestCase): @patch("backend.layers.thirdparty.crossref_provider.requests.get") def test__provider_does_not_call_crossref_in_test(self, mock_get): provider = CrossrefProvider() - res = provider.fetch_metadata("test_doi") - self.assertIsNone(res) + metadata, doi = provider.fetch_metadata("test_doi") + self.assertIsNone(metadata) mock_get.assert_not_called() @patch("backend.layers.thirdparty.crossref_provider.requests.get") @@ -56,7 +57,8 @@ def test__provider_calls_crossref_if_api_key_defined(self, mock_config, mock_get mock_get.return_value = response provider = CrossrefProvider() - res = provider.fetch_metadata("test_doi") + res, doi_curie_from_crossref = provider.fetch_metadata("test_doi") + self.assertEqual("test_doi", doi_curie_from_crossref) mock_get.assert_called_once() expected_response = { @@ -71,6 +73,104 @@ def test__provider_calls_crossref_if_api_key_defined(self, mock_config, mock_get self.assertDictEqual(expected_response, res) + @patch("backend.layers.thirdparty.crossref_provider.requests.get") + @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") + def test__published_doi_used_if_exists_for_preprint(self, mock_config, mock_get): + + # Defining a mocked CorporaConfig will allow the provider to consider the `crossref_api_key` + # not None, so it will go ahead and do the mocked call. + + def make_response(content): + response = Response() + response.status_code = 200 + response._content = str.encode(json.dumps(content)) + return response + + body = { + "status": "ok", + "message": { + "author": [ + { + "given": "John", + "family": "Doe", + "sequence": "first", + }, + { + "given": "Jane", + "family": "Doe", + "sequence": "additional", + }, + ], + "published-online": {"date-parts": [[2021, 11, 10]]}, + "container-title": ["Nature"], + "subtype": "preprint", + "relation": { + "is-preprint-of": [ + {"id-type": "not_doi"}, + { + "id": "published_doi", + "id-type": "doi", + }, + ] + }, + }, + } + + provider = CrossrefProvider() + + with self.subTest("Published DOI is used when available"): + preprint_body = copy.deepcopy(body) + response_preprint = make_response(preprint_body) + + published_body = copy.deepcopy(body) + del published_body["message"]["subtype"] + published_body["message"]["author"][0]["given"] = "Jonathan" + response_published = make_response(published_body) + + responses = [response_published, response_preprint] + mock_get.side_effect = lambda *x, **y: responses.pop() + res, doi_curie_from_crossref = provider.fetch_metadata("preprint_doi") + self.assertEqual("published_doi", doi_curie_from_crossref) + expected_response = { + "authors": [{"given": "Jonathan", "family": "Doe"}, {"given": "Jane", "family": "Doe"}], + "published_year": 2021, + "published_month": 11, + "published_day": 10, + "published_at": 1636502400.0, + "journal": "Nature", + "is_preprint": False, + } + + self.assertDictEqual(expected_response, res) + + with self.subTest("Preprint DOI is used when published is referenced but cannot be retrieved") and patch.object( + provider, "fetch_published_metadata" + ) as fetch_published_metadata_mock: + fetch_published_metadata_mock.return_value = (None, None) + + preprint_body = copy.deepcopy(body) + response_preprint = make_response(preprint_body) + + published_body = copy.deepcopy(body) + del published_body["message"]["subtype"] + response_published = make_response(published_body) + + responses = [response_published, response_preprint] + mock_get.side_effect = lambda *x, **y: responses.pop() + res, doi_curie_from_crossref = provider.fetch_metadata("preprint_doi") + self.assertEqual("preprint_doi", doi_curie_from_crossref) + expected_response = { + "authors": [{"given": "John", "family": "Doe"}, {"given": "Jane", "family": "Doe"}], + "published_year": 2021, + "published_month": 11, + "published_day": 10, + "published_at": 1636502400.0, + "journal": "Nature", + "is_preprint": True, + } + + self.assertDictEqual(expected_response, res) + @patch("backend.layers.thirdparty.crossref_provider.requests.get") @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_get): @@ -114,7 +214,7 @@ def test__provider_parses_authors_and_dates_correctly(self, mock_config, mock_ge mock_get.return_value = response provider = CrossrefProvider() - res = provider.fetch_metadata("test_doi") + res, _ = provider.fetch_metadata("test_doi") mock_get.assert_called_once() expected_response = { @@ -158,7 +258,7 @@ def test__provider_unescapes_journal_correctly(self, mock_config, mock_get): mock_get.return_value = response provider = CrossrefProvider() - res = provider.fetch_metadata("test_doi") + author_data, _ = provider.fetch_metadata("test_doi") mock_get.assert_called_once() expected_response = { @@ -173,7 +273,7 @@ def test__provider_unescapes_journal_correctly(self, mock_config, mock_get): "is_preprint": False, } - self.assertDictEqual(expected_response, res) + self.assertDictEqual(expected_response, author_data) @patch("backend.layers.thirdparty.crossref_provider.requests.get") @patch("backend.layers.thirdparty.crossref_provider.CorporaConfig") diff --git a/tests/unit/processing/test_process_validate.py b/tests/unit/processing/test_process_validate.py index d1431f272ad33..f8e577c2bf0dd 100644 --- a/tests/unit/processing/test_process_validate.py +++ b/tests/unit/processing/test_process_validate.py @@ -27,6 +27,7 @@ def test_process_download_validate_success(self, mock_read_h5ad): 6. upload the labeled file to S3 """ dropbox_uri = "https://www.dropbox.com/s/fake_location/test.h5ad?dl=0" + self.crossref_provider.fetch_metadata = Mock(return_value=({}, "12.2345")) collection = self.generate_unpublished_collection( links=[Link(name=None, type="DOI", uri="http://doi.org/12.2345")] @@ -49,7 +50,7 @@ def test_process_download_validate_success(self, mock_read_h5ad): pdv = ProcessValidate(self.business_logic, self.uri_provider, self.s3_provider, self.schema_validator) pdv.process(collection.version_id, dataset_version_id, "fake_bucket_name", "fake_datasets_bucket") citation_str = ( - f"Publication: http://doi.org/12.2345 " + f"Publication: https://doi.org/12.2345 " f"Dataset Version: http://domain/{dataset_version_id}.h5ad curated and distributed by " f"CZ CELLxGENE Discover in Collection: https://domain/collections/{collection.collection_id.id}" )