feat: annotate and persist citation in dataset artifacts, DB, and Discover API (#5968)

nayib-jose-gloria authored Oct 20, 2023
1 parent b7303ac commit ca448b2
Showing 11 changed files with 149 additions and 12 deletions.

.happy/terraform/modules/sfn/main.tf (6 changes: 5 additions & 1 deletion)

@@ -58,7 +58,11 @@ resource "aws_sfn_state_machine" "state_machine" {
             },
             {
               "Name": "DATASET_ID",
-              "Value.$": "$.dataset_id"
+              "Value.$": "$.dataset_id"
+            },
+            {
+              "Name": "COLLECTION_ID",
+              "Value.$": "$.collection_id"
             },
             {
               "Name": "STEP_NAME",
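
The new COLLECTION_ID entry follows the existing DATASET_ID plumbing: the JSONPath selector "$.collection_id" copies that field from the Step Functions execution input into the batch container's environment. A minimal sketch of the consuming side (placeholder values are hypothetical; the real consumption lands in process.py below):

    import os

    # Inside the batch container, the overrides above surface as ordinary
    # environment variables; the state machine fills them from its input.
    dataset_id = os.environ["DATASET_ID"]            # from "$.dataset_id"
    collection_id = os.environ.get("COLLECTION_ID")  # from "$.collection_id" (new)
    print(f"processing dataset {dataset_id} in collection {collection_id}")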

backend/curation/api/curation-api.yml (14 changes: 14 additions & 0 deletions)

@@ -716,6 +716,14 @@ components:
       type: string
       nullable: true
       example: ["patient", "seqBatch"]
+    citation:
+      description: |
+        Citation that includes a downloadable permalink to the h5ad artifact for this dataset, a permalink to
+        the collection it belongs to in CZ CELLxGENE Discover, and--if applicable--the Publication DOI
+        associated with the dataset. See details about the exact format in the
+        [schema definition](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#citation)
+      type: string
+      nullable: true
     collection_list:
       description: Collection metadata
       properties:
@@ -988,6 +996,8 @@ components:
         items:
           $ref: "#/components/schemas/ontology_element"
         type: array
+      citation:
+        $ref: "#/components/schemas/citation"
       dataset_id:
         $ref: "#/components/schemas/dataset_id"
       dataset_version_id:
@@ -1092,6 +1102,8 @@ components:
         items:
           $ref: "#/components/schemas/ontology_element"
         type: array
+      citation:
+        $ref: "#/components/schemas/citation"
       collection_doi:
         $ref: "#/components/schemas/doi"
       collection_id:
@@ -1189,6 +1201,8 @@ components:
         items:
           $ref: "#/components/schemas/ontology_element"
         type: array
+      citation:
+        $ref: "#/components/schemas/citation"
       collection_id:
         $ref: "#/components/schemas/collection_id"
       collection_version_id:
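
Downstream, the Discover API serializes the stored string as-is. For illustration, a dataset object in a response would carry the new nullable field like this (a sketch; the IDs and URLs are placeholders borrowed from the test fixtures below, not a real response):

    # Sketch of a dataset payload with the new "citation" field.
    dataset = {
        "dataset_id": "dataset_id",  # placeholder
        "citation": (
            "Publication: https://doi.org/12.2345/science.abc1234 "
            "Dataset Version: https://datasets.cellxgene.cziscience.com/dataset_id.h5ad "
            "curated and distributed by CZ CELLxGENE Discover in Collection: "
            "https://cellxgene.cziscience.com/collections/collection_id"
        ),
    }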

backend/curation/api/v1/curation/collections/common.py (1 change: 1 addition & 0 deletions)

@@ -322,6 +322,7 @@ class EntityColumns:
         "mean_genes_per_cell",
         "schema_version",
         "donor_id",
+        "citation",
     ]

     dataset_metadata_cols = [

backend/layers/common/entities.py (1 change: 1 addition & 0 deletions)

@@ -183,6 +183,7 @@ class DatasetMetadata:
     donor_id: List[str]
     is_primary_data: str
     x_approximate_distribution: Optional[str]
+    citation: Optional[str] = None
     default_embedding: Optional[str] = None
     embeddings: Optional[List[str]] = None
     feature_biotype: Optional[List[str]] = None

backend/layers/processing/process.py (9 changes: 7 additions & 2 deletions)

@@ -5,6 +5,7 @@
 from backend.layers.business.business import BusinessLogic
 from backend.layers.business.business_interface import BusinessLogicInterface
 from backend.layers.common.entities import (
+    CollectionVersionId,
     DatasetConversionStatus,
     DatasetProcessingStatus,
     DatasetStatusKey,
@@ -89,6 +90,7 @@ def log_batch_environment(self):

     def process(
         self,
+        collection_id: Optional[CollectionVersionId],
         dataset_id: DatasetVersionId,
         step_name: str,
         dropbox_uri: Optional[str],
@@ -102,7 +104,9 @@
         self.logger.info(f"Processing dataset {dataset_id}")
         try:
             if step_name == "download-validate":
-                self.process_download_validate.process(dataset_id, dropbox_uri, artifact_bucket, datasets_bucket)
+                self.process_download_validate.process(
+                    collection_id, dataset_id, dropbox_uri, artifact_bucket, datasets_bucket
+                )
             elif step_name == "cxg":
                 self.process_cxg.process(dataset_id, artifact_bucket, cxg_bucket)
             elif step_name == "cxg_remaster":
@@ -149,12 +153,13 @@ def main(self):
             rv = self.schema_migrate.migrate(step_name)
         else:
             dataset_id = os.environ["DATASET_ID"]
-
+            collection_id = os.environ.get("COLLECTION_ID")
             dropbox_uri = os.environ.get("DROPBOX_URL")
             artifact_bucket = os.environ.get("ARTIFACT_BUCKET")
             datasets_bucket = os.environ.get("DATASETS_BUCKET")
             cxg_bucket = os.environ.get("CELLXGENE_BUCKET")
             rv = self.process(
+                collection_id=None if collection_id is None else CollectionVersionId(collection_id),
                 dataset_id=DatasetVersionId(dataset_id),
                 step_name=step_name,
                 dropbox_uri=dropbox_uri,
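
main() now hands an optional collection version through to process(); only the download-validate branch uses it. A minimal sketch of the new wiring, with hypothetical placeholder IDs:

    import os

    from backend.layers.common.entities import CollectionVersionId, DatasetVersionId

    # Placeholder values; in production the state machine injects these.
    os.environ.setdefault("DATASET_ID", "example-dataset-version-id")
    os.environ.setdefault("COLLECTION_ID", "example-collection-version-id")

    # A missing COLLECTION_ID simply becomes None, matching the new
    # Optional[CollectionVersionId] parameter on process().
    collection_id = os.environ.get("COLLECTION_ID")
    wrapped = None if collection_id is None else CollectionVersionId(collection_id)
    dataset_version_id = DatasetVersionId(os.environ["DATASET_ID"])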

backend/layers/processing/process_download_validate.py (49 changes: 46 additions & 3 deletions)

@@ -7,6 +7,7 @@
 from backend.common.utils.corpora_constants import CorporaConstants
 from backend.layers.business.business_interface import BusinessLogicInterface
 from backend.layers.common.entities import (
+    CollectionVersionId,
     DatasetArtifactType,
     DatasetConversionStatus,
     DatasetMetadata,
@@ -60,7 +61,9 @@ def __init__(
         self.schema_validator = schema_validator

     @logit
-    def validate_h5ad_file_and_add_labels(self, dataset_id: DatasetVersionId, local_filename: str) -> Tuple[str, bool]:
+    def validate_h5ad_file_and_add_labels(
+        self, collection_id: CollectionVersionId, dataset_id: DatasetVersionId, local_filename: str
+    ) -> Tuple[str, bool]:
         """
         Validates and labels the specified dataset file and updates the processing status in the database
         :param dataset_id: ID of the dataset to update
@@ -83,11 +86,40 @@
         if not is_valid:
             raise ValidationFailed(errors)
         else:
+            if CorporaConfig().schema_4_feature_flag.lower() == "true":
+                self.populate_dataset_citation(collection_id, dataset_id, output_filename)
+
             # TODO: optionally, these could be batched into one
             self.update_processing_status(dataset_id, DatasetStatusKey.H5AD, DatasetConversionStatus.CONVERTED)
             self.update_processing_status(dataset_id, DatasetStatusKey.VALIDATION, DatasetValidationStatus.VALID)
             return output_filename, can_convert_to_seurat

+    def populate_dataset_citation(
+        self, collection_id: CollectionVersionId, dataset_id: DatasetVersionId, adata_path: str
+    ) -> None:
+        """
+        Builds citation string and updates the 'uns' dict of the adata at adata_path
+        :param collection_id: version ID for collection dataset is being uploaded to
+        :param dataset_id: version ID for dataset
+        :param adata_path: filepath to adata object that will be updated with citation
+        """
+        dataset_assets_base_url = CorporaConfig().dataset_assets_base_url
+        collections_base_url = CorporaConfig().collections_base_url
+        citation = ""
+        collection = self.business_logic.get_collection_version(collection_id)
+        doi = next((link.uri for link in collection.metadata.links if link.type == "DOI"), None)
+        if doi:
+            citation += f"Publication: {doi} "
+        citation += f"Dataset Version: {dataset_assets_base_url}/{dataset_id}.h5ad "
+        citation += (
+            f"curated and distributed by CZ CELLxGENE Discover in Collection: "
+            f"{collections_base_url}/{collection_id}"
+        )
+        adata = scanpy.read_h5ad(adata_path)
+        adata.uns["citation"] = citation
+        adata.write(adata_path)
+
     @logit
     def extract_metadata(self, filename) -> DatasetMetadata:
         """Pull metadata out of the AnnData file to insert into the dataset table."""
@@ -175,6 +207,7 @@ def _get_batch_condition() -> Optional[str]:
             default_embedding=adata.uns.get("default_embedding"),
             embeddings=adata.obsm_keys(),
             raw_data_location="raw.X" if adata.raw else "X",
+            citation=adata.uns.get("citation"),
         )

     def wrapped_download_from_s3(
@@ -233,12 +266,20 @@ def remove_prefix(self, string: str, prefix: str) -> str:
         else:
             return string[:]

-    def process(self, dataset_id: DatasetVersionId, dropbox_url: str, artifact_bucket: str, datasets_bucket: str):
+    def process(
+        self,
+        collection_id: CollectionVersionId,
+        dataset_id: DatasetVersionId,
+        dropbox_url: str,
+        artifact_bucket: str,
+        datasets_bucket: str,
+    ):
         """
         1. Download the original dataset from Dropbox
         2. Validate and label it
         3. Upload the labeled dataset to the artifact bucket
         4. Upload the labeled dataset to the datasets bucket
+        :param collection_id:
         :param dataset_id:
         :param dropbox_url:
         :param artifact_bucket:
@@ -256,7 +297,9 @@
         )

         # Validate and label the dataset
-        file_with_labels, can_convert_to_seurat = self.validate_h5ad_file_and_add_labels(dataset_id, local_filename)
+        file_with_labels, can_convert_to_seurat = self.validate_h5ad_file_and_add_labels(
+            collection_id, dataset_id, local_filename
+        )
         # Process metadata
         metadata = self.extract_metadata(file_with_labels)
         self.business_logic.set_dataset_metadata(dataset_id, metadata)
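
Depending on whether the collection's links include a DOI, populate_dataset_citation produces one of two shapes. A sketch with placeholder IDs and the base URLs seen in the tests (the real values come from CorporaConfig):

    # With a DOI link on the collection:
    with_doi = (
        "Publication: https://doi.org/12.2345/science.abc1234 "
        "Dataset Version: https://datasets.cellxgene.cziscience.com/<dataset_version_id>.h5ad "
        "curated and distributed by CZ CELLxGENE Discover in Collection: "
        "https://cellxgene.cziscience.com/collections/<collection_version_id>"
    )

    # Without a DOI link, the "Publication: ..." prefix is simply omitted:
    without_doi = (
        "Dataset Version: https://datasets.cellxgene.cziscience.com/<dataset_version_id>.h5ad "
        "curated and distributed by CZ CELLxGENE Discover in Collection: "
        "https://cellxgene.cziscience.com/collections/<collection_version_id>"
    )

Note that the citation is only written when schema_4_feature_flag is "true", and extract_metadata then reads the same value back out of adata.uns, so it is persisted to the database and served by the Discover API.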

tests/unit/backend/layers/api/test_curation_api.py (4 changes: 4 additions & 0 deletions)

@@ -1024,6 +1024,10 @@ def test_get_collection_version_ok(self):
             "cell_count": 10,
             "primary_cell_count": 5,
             "cell_type": [{"label": "test_cell_type_label", "ontology_term_id": "test_cell_type_term_id"}],
+            "citation": "Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
             "dataset_id": f"{first_version.datasets[0].dataset_id.id}",
             "dataset_version_id": f"{first_version.datasets[0].version_id.id}",
             "default_embedding": "X_embedding_1",

tests/unit/backend/layers/business/test_business.py (4 changes: 4 additions & 0 deletions)

@@ -162,6 +162,10 @@ def mock_config_fn(name):
             feature_count=400,
             feature_reference=["NCBITaxon:9606"],
             raw_data_location="raw.X",
+            citation="Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         )
         self.s3_provider.mock_s3_fs = set()

tests/unit/backend/layers/common/base_test.py (4 changes: 4 additions & 0 deletions)

@@ -143,6 +143,10 @@ def mock_config_fn(name):
             feature_count=400,
             feature_reference=["NCBITaxon:9606"],
             raw_data_location="raw.X",
+            citation="Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         )

         self.sample_collection_metadata = CollectionMetadata(

tests/unit/processing/test_extract_metadata.py (9 changes: 9 additions & 0 deletions)

@@ -92,6 +92,10 @@ def test_extract_metadata(self, mock_read_h5ad):
             "batch_condition": np.array({"batchA", "batchB"}),
             "schema_version": "3.0.0",
             "default_embedding": "X_umap",
+            "citation": "Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         }

         var = pandas.DataFrame(
@@ -166,6 +170,7 @@ def test_extract_metadata(self, mock_read_h5ad):
         self.assertEqual(extracted_metadata.x_approximate_distribution, "NORMAL")
         self.assertEqual(extracted_metadata.batch_condition, np.array({"batchA", "batchB"}))
         self.assertEqual(extracted_metadata.schema_version, "3.0.0")
+        self.assertEqual(extracted_metadata.citation, uns["citation"])

         self.assertEqual(extracted_metadata.cell_count, 50001)
         self.assertEqual(extracted_metadata.primary_cell_count, 0)
@@ -249,6 +254,10 @@ def test_extract_metadata_find_raw_layer(self, mock_read_h5ad):
             "X_approximate_distribution": "normal",
             "batch_condition": np.array({"batchA", "batchB"}),
             "schema_version": "3.0.0",
+            "citation": "Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         }

         var = pandas.DataFrame(

(1 more changed file, with 54 additions and 6 deletions, did not load and is not shown.)
