diff --git a/.happy/terraform/modules/sfn/main.tf b/.happy/terraform/modules/sfn/main.tf
index e7da0ef465c15..c152b468316aa 100644
--- a/.happy/terraform/modules/sfn/main.tf
+++ b/.happy/terraform/modules/sfn/main.tf
@@ -58,7 +58,11 @@ resource "aws_sfn_state_machine" "state_machine" {
             },
             {
               "Name": "DATASET_ID",
-              "Value.$": "$.dataset_id"
+              "Value.$": "$.dataset_id"
+            },
+            {
+              "Name": "COLLECTION_ID",
+              "Value.$": "$.collection_id"
             },
             {
               "Name": "STEP_NAME",
diff --git a/backend/curation/api/curation-api.yml b/backend/curation/api/curation-api.yml
index bdc7364e27f7b..10330175a0fee 100644
--- a/backend/curation/api/curation-api.yml
+++ b/backend/curation/api/curation-api.yml
@@ -716,6 +716,14 @@ components:
       type: string
       nullable: true
       example: ["patient", "seqBatch"]
+    citation:
+      description: |
+        Citation that includes a downloadable permalink to the h5ad artifact for this dataset, a permalink to the
+        collection it belongs to in CZ CELLxGENE Discover, and, if applicable, the Publication DOI associated with
+        the dataset. See details about the exact format in the
+        [schema definition](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#citation)
+      type: string
+      nullable: true
     collection_list:
       description: Collection metadata
       properties:
@@ -988,6 +996,8 @@
           items:
             $ref: "#/components/schemas/ontology_element"
           type: array
+        citation:
+          $ref: "#/components/schemas/citation"
         dataset_id:
           $ref: "#/components/schemas/dataset_id"
         dataset_version_id:
@@ -1092,6 +1102,8 @@
           items:
             $ref: "#/components/schemas/ontology_element"
           type: array
+        citation:
+          $ref: "#/components/schemas/citation"
         collection_doi:
           $ref: "#/components/schemas/doi"
         collection_id:
           $ref: "#/components/schemas/collection_id"
@@ -1189,6 +1201,8 @@
           items:
             $ref: "#/components/schemas/ontology_element"
           type: array
+        citation:
+          $ref: "#/components/schemas/citation"
         collection_id:
           $ref: "#/components/schemas/collection_id"
         collection_version_id:
diff --git a/backend/curation/api/v1/curation/collections/common.py b/backend/curation/api/v1/curation/collections/common.py
index a710c9e643aa9..dd99da822d7e1 100644
--- a/backend/curation/api/v1/curation/collections/common.py
+++ b/backend/curation/api/v1/curation/collections/common.py
@@ -322,6 +322,7 @@ class EntityColumns:
         "mean_genes_per_cell",
         "schema_version",
         "donor_id",
+        "citation",
     ]
 
     dataset_metadata_cols = [
diff --git a/backend/layers/common/entities.py b/backend/layers/common/entities.py
index abfc47e371d99..eedf453d3ca23 100644
--- a/backend/layers/common/entities.py
+++ b/backend/layers/common/entities.py
@@ -183,6 +183,7 @@ class DatasetMetadata:
     donor_id: List[str]
     is_primary_data: str
     x_approximate_distribution: Optional[str]
+    citation: Optional[str] = None
     default_embedding: Optional[str] = None
     embeddings: Optional[List[str]] = None
     feature_biotype: Optional[List[str]] = None
diff --git a/backend/layers/processing/process.py b/backend/layers/processing/process.py
index ff2f61322e36a..b1e7a560c8d7d 100644
--- a/backend/layers/processing/process.py
+++ b/backend/layers/processing/process.py
@@ -5,6 +5,7 @@
 from backend.layers.business.business import BusinessLogic
 from backend.layers.business.business_interface import BusinessLogicInterface
 from backend.layers.common.entities import (
+    CollectionVersionId,
     DatasetConversionStatus,
     DatasetProcessingStatus,
     DatasetStatusKey,
@@ -89,6 +90,7 @@ def log_batch_environment(self):
     def process(
         self,
+        collection_id: Optional[CollectionVersionId],
         dataset_id: DatasetVersionId,
         step_name: str,
         dropbox_uri: Optional[str],
@@ -102,7 +104,9 @@ def process(
         self.logger.info(f"Processing dataset {dataset_id}")
         try:
             if step_name == "download-validate":
-                self.process_download_validate.process(dataset_id, dropbox_uri, artifact_bucket, datasets_bucket)
+                self.process_download_validate.process(
+                    collection_id, dataset_id, dropbox_uri, artifact_bucket, datasets_bucket
+                )
             elif step_name == "cxg":
                 self.process_cxg.process(dataset_id, artifact_bucket, cxg_bucket)
             elif step_name == "cxg_remaster":
@@ -149,12 +153,13 @@ def main(self):
             rv = self.schema_migrate.migrate(step_name)
         else:
             dataset_id = os.environ["DATASET_ID"]
-
+            collection_id = os.environ.get("COLLECTION_ID")
             dropbox_uri = os.environ.get("DROPBOX_URL")
             artifact_bucket = os.environ.get("ARTIFACT_BUCKET")
             datasets_bucket = os.environ.get("DATASETS_BUCKET")
             cxg_bucket = os.environ.get("CELLXGENE_BUCKET")
             rv = self.process(
+                collection_id=None if collection_id is None else CollectionVersionId(collection_id),
                 dataset_id=DatasetVersionId(dataset_id),
                 step_name=step_name,
                 dropbox_uri=dropbox_uri,
diff --git a/backend/layers/processing/process_download_validate.py b/backend/layers/processing/process_download_validate.py
index 2d0c4c65fb824..e51d2aa428a15 100644
--- a/backend/layers/processing/process_download_validate.py
+++ b/backend/layers/processing/process_download_validate.py
@@ -7,6 +7,7 @@
 from backend.common.utils.corpora_constants import CorporaConstants
 from backend.layers.business.business_interface import BusinessLogicInterface
 from backend.layers.common.entities import (
+    CollectionVersionId,
     DatasetArtifactType,
     DatasetConversionStatus,
     DatasetMetadata,
@@ -60,7 +61,9 @@ def __init__(
         self.schema_validator = schema_validator
 
     @logit
-    def validate_h5ad_file_and_add_labels(self, dataset_id: DatasetVersionId, local_filename: str) -> Tuple[str, bool]:
+    def validate_h5ad_file_and_add_labels(
+        self, collection_id: CollectionVersionId, dataset_id: DatasetVersionId, local_filename: str
+    ) -> Tuple[str, bool]:
         """
         Validates and labels the specified dataset file and updates the processing status in the database
         :param dataset_id: ID of the dataset to update
@@ -83,11 +86,40 @@ def validate_h5ad_file_and_add_labels(self, dataset_id: DatasetVersionId, local_
         if not is_valid:
             raise ValidationFailed(errors)
         else:
+            if CorporaConfig().schema_4_feature_flag.lower() == "true":
+                self.populate_dataset_citation(collection_id, dataset_id, output_filename)
+
             # TODO: optionally, these could be batched into one
             self.update_processing_status(dataset_id, DatasetStatusKey.H5AD, DatasetConversionStatus.CONVERTED)
             self.update_processing_status(dataset_id, DatasetStatusKey.VALIDATION, DatasetValidationStatus.VALID)
             return output_filename, can_convert_to_seurat
 
+    def populate_dataset_citation(
+        self, collection_id: CollectionVersionId, dataset_id: DatasetVersionId, adata_path: str
+    ) -> None:
+        """
+        Builds the citation string and updates the 'uns' dict of the AnnData at adata_path
+
+        :param collection_id: version ID of the collection the dataset is being uploaded to
+        :param dataset_id: version ID of the dataset
+        :param adata_path: filepath to the AnnData object that will be updated with the citation
+        """
+        dataset_assets_base_url = CorporaConfig().dataset_assets_base_url
+        collections_base_url = CorporaConfig().collections_base_url
+        citation = ""
+        collection = self.business_logic.get_collection_version(collection_id)
+        doi = next((link.uri for link in collection.metadata.links if link.type == "DOI"), None)
+        if doi:
+            citation += f"Publication: {doi} "
+        citation += f"Dataset Version: {dataset_assets_base_url}/{dataset_id}.h5ad "
+        citation += (
+            f"curated and distributed by CZ CELLxGENE Discover in Collection: "
+            f"{collections_base_url}/{collection_id}"
+        )
+        adata = scanpy.read_h5ad(adata_path)
+        adata.uns["citation"] = citation
+        adata.write(adata_path)
+
     @logit
     def extract_metadata(self, filename) -> DatasetMetadata:
         """Pull metadata out of the AnnData file to insert into the dataset table."""
@@ -175,6 +207,7 @@ def _get_batch_condition() -> Optional[str]:
             default_embedding=adata.uns.get("default_embedding"),
             embeddings=adata.obsm_keys(),
             raw_data_location="raw.X" if adata.raw else "X",
+            citation=adata.uns.get("citation"),
         )
 
     def wrapped_download_from_s3(
@@ -233,12 +266,20 @@ def remove_prefix(self, string: str, prefix: str) -> str:
         else:
             return string[:]
 
-    def process(self, dataset_id: DatasetVersionId, dropbox_url: str, artifact_bucket: str, datasets_bucket: str):
+    def process(
+        self,
+        collection_id: CollectionVersionId,
+        dataset_id: DatasetVersionId,
+        dropbox_url: str,
+        artifact_bucket: str,
+        datasets_bucket: str,
+    ):
         """
         1. Download the original dataset from Dropbox
         2. Validate and label it
         3. Upload the labeled dataset to the artifact bucket
         4. Upload the labeled dataset to the datasets bucket
+        :param collection_id:
         :param dataset_id:
         :param dropbox_url:
         :param artifact_bucket:
@@ -256,7 +297,9 @@ def process(self, dataset_id: DatasetVersionId, dropbox_url: str, artifact_bucke
         )
 
         # Validate and label the dataset
-        file_with_labels, can_convert_to_seurat = self.validate_h5ad_file_and_add_labels(dataset_id, local_filename)
+        file_with_labels, can_convert_to_seurat = self.validate_h5ad_file_and_add_labels(
+            collection_id, dataset_id, local_filename
+        )
         # Process metadata
         metadata = self.extract_metadata(file_with_labels)
         self.business_logic.set_dataset_metadata(dataset_id, metadata)
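Note on the citation format: the logic `populate_dataset_citation` adds above can be reduced to a small standalone sketch. This is illustrative only, not code from the PR; `build_citation` and its default base URLs are placeholders (in the real code both URLs come from `CorporaConfig`, and the IDs are version IDs):

```python
from typing import Optional


def build_citation(
    doi: Optional[str],
    dataset_version_id: str,
    collection_version_id: str,
    dataset_assets_base_url: str = "https://datasets.cellxgene.cziscience.com",  # placeholder config value
    collections_base_url: str = "https://cellxgene.cziscience.com/collections",  # placeholder config value
) -> str:
    citation = ""
    if doi:
        # The publication segment appears only when the collection has a DOI link.
        citation += f"Publication: {doi} "
    citation += f"Dataset Version: {dataset_assets_base_url}/{dataset_version_id}.h5ad "
    citation += (
        "curated and distributed by CZ CELLxGENE Discover in Collection: "
        f"{collections_base_url}/{collection_version_id}"
    )
    return citation


print(build_citation("https://doi.org/12.2345/science.abc1234", "dataset_id", "collection_id"))
```

The expected strings in the test fixtures below follow exactly this shape, with and without the leading `Publication:` segment.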
diff --git a/tests/unit/backend/layers/api/test_curation_api.py b/tests/unit/backend/layers/api/test_curation_api.py
index e36152fc3bb53..8b125df670d13 100644
--- a/tests/unit/backend/layers/api/test_curation_api.py
+++ b/tests/unit/backend/layers/api/test_curation_api.py
@@ -1024,6 +1024,10 @@ def test_get_collection_version_ok(self):
             "cell_count": 10,
             "primary_cell_count": 5,
             "cell_type": [{"label": "test_cell_type_label", "ontology_term_id": "test_cell_type_term_id"}],
+            "citation": "Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
             "dataset_id": f"{first_version.datasets[0].dataset_id.id}",
             "dataset_version_id": f"{first_version.datasets[0].version_id.id}",
             "default_embedding": "X_embedding_1",
diff --git a/tests/unit/backend/layers/business/test_business.py b/tests/unit/backend/layers/business/test_business.py
index 2912f2685ff1a..4e4d46c814cc0 100644
--- a/tests/unit/backend/layers/business/test_business.py
+++ b/tests/unit/backend/layers/business/test_business.py
@@ -162,6 +162,10 @@ def mock_config_fn(name):
             feature_count=400,
             feature_reference=["NCBITaxon:9606"],
             raw_data_location="raw.X",
+            citation="Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         )
 
         self.s3_provider.mock_s3_fs = set()
diff --git a/tests/unit/backend/layers/common/base_test.py b/tests/unit/backend/layers/common/base_test.py
index 80548acfb35c0..58179f659a0ff 100644
--- a/tests/unit/backend/layers/common/base_test.py
+++ b/tests/unit/backend/layers/common/base_test.py
@@ -143,6 +143,10 @@ def mock_config_fn(name):
             feature_count=400,
             feature_reference=["NCBITaxon:9606"],
             raw_data_location="raw.X",
+            citation="Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         )
 
         self.sample_collection_metadata = CollectionMetadata(
diff --git a/tests/unit/processing/test_extract_metadata.py b/tests/unit/processing/test_extract_metadata.py
index c7c4b9492392e..c17820cb1325f 100644
--- a/tests/unit/processing/test_extract_metadata.py
+++ b/tests/unit/processing/test_extract_metadata.py
@@ -92,6 +92,10 @@ def test_extract_metadata(self, mock_read_h5ad):
             "batch_condition": np.array({"batchA", "batchB"}),
             "schema_version": "3.0.0",
             "default_embedding": "X_umap",
+            "citation": "Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         }
 
         var = pandas.DataFrame(
@@ -166,6 +170,7 @@ def test_extract_metadata(self, mock_read_h5ad):
         self.assertEqual(extracted_metadata.x_approximate_distribution, "NORMAL")
         self.assertEqual(extracted_metadata.batch_condition, np.array({"batchA", "batchB"}))
         self.assertEqual(extracted_metadata.schema_version, "3.0.0")
+        self.assertEqual(extracted_metadata.citation, uns["citation"])
 
         self.assertEqual(extracted_metadata.cell_count, 50001)
         self.assertEqual(extracted_metadata.primary_cell_count, 0)
@@ -249,6 +254,10 @@ def test_extract_metadata_find_raw_layer(self, mock_read_h5ad):
             "X_approximate_distribution": "normal",
             "batch_condition": np.array({"batchA", "batchB"}),
             "schema_version": "3.0.0",
+            "citation": "Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         }
 
         var = pandas.DataFrame(
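Note on the config mocking used below: `test_processing.py` stubs `CorporaConfig` by patching its `__getattr__` with a `side_effect` that reads from a plain dict. Because dunder lookups go through the type, and the replacement `MagicMock` is not a descriptor, the side-effect function receives only the attribute name. A minimal self-contained sketch of the pattern, using a hypothetical stand-in `FakeConfig` class rather than the real `CorporaConfig`:

```python
from unittest.mock import patch


class FakeConfig:
    """Stand-in for a config class that resolves settings dynamically."""

    def __getattr__(self, name):
        raise RuntimeError("would normally hit a secrets store")


mock_config_attr = {
    "schema_4_feature_flag": "True",
    "collections_base_url": "http://collections",
}


def mock_config_fn(name):
    # Called with just the attribute name; the returned value becomes the
    # result of the attribute access.
    return mock_config_attr[name]


with patch.object(FakeConfig, "__getattr__", side_effect=mock_config_fn):
    assert FakeConfig().schema_4_feature_flag == "True"
    assert FakeConfig().collections_base_url == "http://collections"
```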
diff --git a/tests/unit/processing/test_processing.py b/tests/unit/processing/test_processing.py
index 676a6aabae55e..7f6c212009fd1 100644
--- a/tests/unit/processing/test_processing.py
+++ b/tests/unit/processing/test_processing.py
@@ -1,4 +1,4 @@
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, patch
 
 from backend.layers.common.entities import (
     DatasetArtifactType,
@@ -6,6 +6,8 @@
     DatasetProcessingStatus,
     DatasetUploadStatus,
     DatasetValidationStatus,
+    DatasetVersionId,
+    Link,
 )
 from backend.layers.processing.process import ProcessMain
 from backend.layers.processing.process_cxg import ProcessCxg
@@ -13,9 +15,22 @@
 from backend.layers.processing.process_seurat import ProcessSeurat
 from tests.unit.processing.base_processing_test import BaseProcessingTest
 
+mock_config_attr = {
+    "collections_base_url": "http://collections",
+    "dataset_assets_base_url": "http://domain",
+    "upload_max_file_size_gb": 1,
+    "schema_4_feature_flag": "True",
+}
+
+
+def mock_config_fn(name):
+    return mock_config_attr[name]
+
 
 class ProcessingTest(BaseProcessingTest):
-    def test_process_download_validate_success(self):
+    @patch("scanpy.read_h5ad")
+    @patch("backend.common.corpora_config.CorporaConfig.__getattr__", side_effect=mock_config_fn)
+    def test_process_download_validate_success(self, mock_config, mock_read_h5ad):
         """
         ProcessDownloadValidate should:
         1. Download the h5ad artifact
@@ -27,7 +42,9 @@ def test_process_download_validate_success(self):
         """
         dropbox_uri = "https://www.dropbox.com/s/fake_location/test.h5ad?dl=0"
 
-        collection = self.generate_unpublished_collection()
+        collection = self.generate_unpublished_collection(
+            links=[Link(name=None, type="DOI", uri="http://doi.org/12.2345")]
+        )
         dataset_version_id, dataset_id = self.business_logic.ingest_dataset(
             collection.version_id, dropbox_uri, None, None
         )
@@ -39,13 +56,22 @@ def test_process_download_validate_success(self):
         self.assertEqual(status.processing_status, DatasetProcessingStatus.INITIALIZED)
         self.assertEqual(status.upload_status, DatasetUploadStatus.WAITING)
 
+        mock_read_h5ad.return_value = MagicMock(uns=dict())
+        # TODO: ideally use a real h5ad so that
         with patch("backend.layers.processing.process_download_validate.ProcessDownloadValidate.extract_metadata"):
             pdv = ProcessDownloadValidate(
                 self.business_logic, self.uri_provider, self.s3_provider, self.downloader, self.schema_validator
             )
-            pdv.process(dataset_version_id, dropbox_uri, "fake_bucket_name", "fake_datasets_bucket")
-
+            pdv.process(
+                collection.version_id, dataset_version_id, dropbox_uri, "fake_bucket_name", "fake_datasets_bucket"
+            )
+        citation_str = (
+            f"Publication: http://doi.org/12.2345 "
+            f"Dataset Version: http://domain/{dataset_version_id}.h5ad curated and distributed by "
+            f"CZ CELLxGENE Discover in Collection: http://collections/{collection.version_id}"
+        )
+        self.assertEqual(mock_read_h5ad.return_value.uns["citation"], citation_str)
         status = self.business_logic.get_dataset_status(dataset_version_id)
         self.assertEqual(status.validation_status, DatasetValidationStatus.VALID)
         self.assertEqual(status.upload_status, DatasetUploadStatus.UPLOADED)
@@ -63,6 +89,23 @@ def test_process_download_validate_success(self):
         artifact.type = DatasetArtifactType.H5AD
         artifact.uri = f"s3://fake_bucket_name/{dataset_version_id.id}/local.h5ad"
 
+    @patch("scanpy.read_h5ad")
+    @patch("backend.common.corpora_config.CorporaConfig.__getattr__", side_effect=mock_config_fn)
+    def test_populate_dataset_citation__no_publication_doi(self, mock_config, mock_read_h5ad):
+        mock_read_h5ad.return_value = MagicMock(uns=dict())
+        collection = self.generate_unpublished_collection()
+
+        pdv = ProcessDownloadValidate(
+            self.business_logic, self.uri_provider, self.s3_provider, self.downloader, self.schema_validator
+        )
+        dataset_version_id = DatasetVersionId()
+        pdv.populate_dataset_citation(collection.version_id, dataset_version_id, "")
+        citation_str = (
+            f"Dataset Version: http://domain/{dataset_version_id}.h5ad curated and distributed by "
+            f"CZ CELLxGENE Discover in Collection: http://collections/{collection.version_id}"
+        )
+        self.assertEqual(mock_read_h5ad.return_value.uns["citation"], citation_str)
+
     def test_process_seurat_success(self):
         collection = self.generate_unpublished_collection()
         dataset_version_id, dataset_id = self.business_logic.ingest_dataset(
@@ -138,10 +181,12 @@ def test_reprocess_cxg_success(self):
         cxg_artifact = [artifact for artifact in artifacts if artifact.type == "cxg"][0]
         self.assertTrue(cxg_artifact, f"s3://fake_cxg_bucket/{dataset_version_id.id}.cxg/")
 
+    @patch("scanpy.read_h5ad")
+    @patch("backend.common.corpora_config.CorporaConfig.__getattr__", side_effect=mock_config_fn)
     @patch("backend.layers.processing.process_download_validate.ProcessDownloadValidate.extract_metadata")
     @patch("backend.layers.processing.process_seurat.ProcessSeurat.make_seurat")
     @patch("backend.layers.processing.process_cxg.ProcessCxg.make_cxg")
-    def test_process_all(self, mock_cxg, mock_seurat, mock_h5ad):
+    def test_process_all(self, mock_cxg, mock_seurat, mock_h5ad, mock_config, mock_read_h5ad):
         mock_seurat.return_value = "local.rds"
         mock_cxg.return_value = "local.cxg"
         dropbox_uri = "https://www.dropbox.com/s/ow84zm4h0wkl409/test.h5ad?dl=0"
@@ -149,12 +194,14 @@ def test_process_all(self, mock_cxg, mock_seurat, mock_h5ad):
         dataset_version_id, dataset_id = self.business_logic.ingest_dataset(
             collection.version_id, dropbox_uri, None, None
         )
+        mock_read_h5ad.return_value = MagicMock(uns=dict())
 
         pm = ProcessMain(
             self.business_logic, self.uri_provider, self.s3_provider, self.downloader, self.schema_validator
         )
 
         for step_name in ["download-validate", "cxg", "seurat"]:
             pm.process(
+                collection.version_id,
                 dataset_version_id,
                 step_name,
                 dropbox_uri,
@@ -210,6 +257,7 @@ def test_process_all_download_validate_fail(self):
 
         for step_name in ["download-validate"]:
             pm.process(
+                collection.version_id,
                 dataset_version_id,
                 step_name,
                 dropbox_uri,