feat: annotate and persist citation in dataset artifacts, DB, and Discover API (#5968)

nayib-jose-gloria authored Oct 20, 2023
1 parent b7303ac commit ca448b2
Showing 11 changed files with 149 additions and 12 deletions.

.happy/terraform/modules/sfn/main.tf (6 changes: 5 additions & 1 deletion)

@@ -58,7 +58,11 @@ resource "aws_sfn_state_machine" "state_machine" {
             },
             {
               "Name": "DATASET_ID",
-              "Value.$": "$.dataset_id"
+              "Value.$": "$.dataset_id"
+            },
+            {
+              "Name": "COLLECTION_ID",
+              "Value.$": "$.collection_id"
             },
             {
               "Name": "STEP_NAME",
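
The new COLLECTION_ID entry follows the existing DATASET_ID plumbing: the JSONPath selector "$.collection_id" copies that field from the Step Functions execution input into the batch container's environment. A minimal sketch of the consuming side (placeholder values are hypothetical; the real consumption lands in process.py below):

    import os

    # Inside the batch container, the overrides above surface as ordinary
    # environment variables; the state machine fills them from its input.
    dataset_id = os.environ["DATASET_ID"]            # from "$.dataset_id"
    collection_id = os.environ.get("COLLECTION_ID")  # from "$.collection_id" (new)
    print(f"processing dataset {dataset_id} in collection {collection_id}")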

backend/curation/api/curation-api.yml (14 changes: 14 additions & 0 deletions)

@@ -716,6 +716,14 @@ components:
       type: string
       nullable: true
       example: ["patient", "seqBatch"]
+    citation:
+      description: |
+        Citation that includes a downloadable permalink to the h5ad artifact for this dataset, a permalink to
+        the collection it belongs to in CZ CELLxGENE Discover, and--if applicable--the Publication DOI
+        associated with the dataset. See details about the exact format in the
+        [schema definition](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#citation)
+      type: string
+      nullable: true
     collection_list:
       description: Collection metadata
       properties:
@@ -988,6 +996,8 @@ components:
         items:
           $ref: "#/components/schemas/ontology_element"
         type: array
+      citation:
+        $ref: "#/components/schemas/citation"
       dataset_id:
         $ref: "#/components/schemas/dataset_id"
       dataset_version_id:
@@ -1092,6 +1102,8 @@ components:
         items:
           $ref: "#/components/schemas/ontology_element"
         type: array
+      citation:
+        $ref: "#/components/schemas/citation"
       collection_doi:
         $ref: "#/components/schemas/doi"
       collection_id:
@@ -1189,6 +1201,8 @@ components:
         items:
           $ref: "#/components/schemas/ontology_element"
         type: array
+      citation:
+        $ref: "#/components/schemas/citation"
       collection_id:
         $ref: "#/components/schemas/collection_id"
       collection_version_id:
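
Downstream, the Discover API serializes the stored string as-is. For illustration, a dataset object in a response would carry the new nullable field like this (a sketch; the IDs and URLs are placeholders borrowed from the test fixtures below, not a real response):

    # Sketch of a dataset payload with the new "citation" field.
    dataset = {
        "dataset_id": "dataset_id",  # placeholder
        "citation": (
            "Publication: https://doi.org/12.2345/science.abc1234 "
            "Dataset Version: https://datasets.cellxgene.cziscience.com/dataset_id.h5ad "
            "curated and distributed by CZ CELLxGENE Discover in Collection: "
            "https://cellxgene.cziscience.com/collections/collection_id"
        ),
    }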

backend/curation/api/v1/curation/collections/common.py (1 change: 1 addition & 0 deletions)

@@ -322,6 +322,7 @@ class EntityColumns:
         "mean_genes_per_cell",
         "schema_version",
         "donor_id",
+        "citation",
     ]

     dataset_metadata_cols = [

backend/layers/common/entities.py (1 change: 1 addition & 0 deletions)

@@ -183,6 +183,7 @@ class DatasetMetadata:
     donor_id: List[str]
     is_primary_data: str
     x_approximate_distribution: Optional[str]
+    citation: Optional[str] = None
     default_embedding: Optional[str] = None
     embeddings: Optional[List[str]] = None
     feature_biotype: Optional[List[str]] = None

backend/layers/processing/process.py (9 changes: 7 additions & 2 deletions)

@@ -5,6 +5,7 @@
 from backend.layers.business.business import BusinessLogic
 from backend.layers.business.business_interface import BusinessLogicInterface
 from backend.layers.common.entities import (
+    CollectionVersionId,
     DatasetConversionStatus,
     DatasetProcessingStatus,
     DatasetStatusKey,
@@ -89,6 +90,7 @@ def log_batch_environment(self):

     def process(
         self,
+        collection_id: Optional[CollectionVersionId],
         dataset_id: DatasetVersionId,
         step_name: str,
         dropbox_uri: Optional[str],
@@ -102,7 +104,9 @@
         self.logger.info(f"Processing dataset {dataset_id}")
         try:
             if step_name == "download-validate":
-                self.process_download_validate.process(dataset_id, dropbox_uri, artifact_bucket, datasets_bucket)
+                self.process_download_validate.process(
+                    collection_id, dataset_id, dropbox_uri, artifact_bucket, datasets_bucket
+                )
             elif step_name == "cxg":
                 self.process_cxg.process(dataset_id, artifact_bucket, cxg_bucket)
             elif step_name == "cxg_remaster":
@@ -149,12 +153,13 @@ def main(self):
             rv = self.schema_migrate.migrate(step_name)
         else:
             dataset_id = os.environ["DATASET_ID"]
-
+            collection_id = os.environ.get("COLLECTION_ID")
             dropbox_uri = os.environ.get("DROPBOX_URL")
             artifact_bucket = os.environ.get("ARTIFACT_BUCKET")
             datasets_bucket = os.environ.get("DATASETS_BUCKET")
             cxg_bucket = os.environ.get("CELLXGENE_BUCKET")
             rv = self.process(
+                collection_id=None if collection_id is None else CollectionVersionId(collection_id),
                 dataset_id=DatasetVersionId(dataset_id),
                 step_name=step_name,
                 dropbox_uri=dropbox_uri,
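
main() now hands an optional collection version through to process(); only the download-validate branch uses it. A minimal sketch of the new wiring, with hypothetical placeholder IDs:

    import os

    from backend.layers.common.entities import CollectionVersionId, DatasetVersionId

    # Placeholder values; in production the state machine injects these.
    os.environ.setdefault("DATASET_ID", "example-dataset-version-id")
    os.environ.setdefault("COLLECTION_ID", "example-collection-version-id")

    # A missing COLLECTION_ID simply becomes None, matching the new
    # Optional[CollectionVersionId] parameter on process().
    collection_id = os.environ.get("COLLECTION_ID")
    wrapped = None if collection_id is None else CollectionVersionId(collection_id)
    dataset_version_id = DatasetVersionId(os.environ["DATASET_ID"])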

backend/layers/processing/process_download_validate.py (49 changes: 46 additions & 3 deletions)

@@ -7,6 +7,7 @@
 from backend.common.utils.corpora_constants import CorporaConstants
 from backend.layers.business.business_interface import BusinessLogicInterface
 from backend.layers.common.entities import (
+    CollectionVersionId,
     DatasetArtifactType,
     DatasetConversionStatus,
     DatasetMetadata,
@@ -60,7 +61,9 @@ def __init__(
         self.schema_validator = schema_validator

     @logit
-    def validate_h5ad_file_and_add_labels(self, dataset_id: DatasetVersionId, local_filename: str) -> Tuple[str, bool]:
+    def validate_h5ad_file_and_add_labels(
+        self, collection_id: CollectionVersionId, dataset_id: DatasetVersionId, local_filename: str
+    ) -> Tuple[str, bool]:
         """
         Validates and labels the specified dataset file and updates the processing status in the database
         :param dataset_id: ID of the dataset to update
@@ -83,11 +86,40 @@
         if not is_valid:
             raise ValidationFailed(errors)
         else:
+            if CorporaConfig().schema_4_feature_flag.lower() == "true":
+                self.populate_dataset_citation(collection_id, dataset_id, output_filename)
+
             # TODO: optionally, these could be batched into one
             self.update_processing_status(dataset_id, DatasetStatusKey.H5AD, DatasetConversionStatus.CONVERTED)
             self.update_processing_status(dataset_id, DatasetStatusKey.VALIDATION, DatasetValidationStatus.VALID)
             return output_filename, can_convert_to_seurat

+    def populate_dataset_citation(
+        self, collection_id: CollectionVersionId, dataset_id: DatasetVersionId, adata_path: str
+    ) -> None:
+        """
+        Builds citation string and updates the 'uns' dict of the adata at adata_path
+        :param collection_id: version ID for collection dataset is being uploaded to
+        :param dataset_id: version ID for dataset
+        :param adata_path: filepath to adata object that will be updated with citation
+        """
+        dataset_assets_base_url = CorporaConfig().dataset_assets_base_url
+        collections_base_url = CorporaConfig().collections_base_url
+        citation = ""
+        collection = self.business_logic.get_collection_version(collection_id)
+        doi = next((link.uri for link in collection.metadata.links if link.type == "DOI"), None)
+        if doi:
+            citation += f"Publication: {doi} "
+        citation += f"Dataset Version: {dataset_assets_base_url}/{dataset_id}.h5ad "
+        citation += (
+            f"curated and distributed by CZ CELLxGENE Discover in Collection: "
+            f"{collections_base_url}/{collection_id}"
+        )
+        adata = scanpy.read_h5ad(adata_path)
+        adata.uns["citation"] = citation
+        adata.write(adata_path)
+
     @logit
     def extract_metadata(self, filename) -> DatasetMetadata:
         """Pull metadata out of the AnnData file to insert into the dataset table."""
@@ -175,6 +207,7 @@ def _get_batch_condition() -> Optional[str]:
             default_embedding=adata.uns.get("default_embedding"),
             embeddings=adata.obsm_keys(),
             raw_data_location="raw.X" if adata.raw else "X",
+            citation=adata.uns.get("citation"),
         )

     def wrapped_download_from_s3(
@@ -233,12 +266,20 @@ def remove_prefix(self, string: str, prefix: str) -> str:
         else:
             return string[:]

-    def process(self, dataset_id: DatasetVersionId, dropbox_url: str, artifact_bucket: str, datasets_bucket: str):
+    def process(
+        self,
+        collection_id: CollectionVersionId,
+        dataset_id: DatasetVersionId,
+        dropbox_url: str,
+        artifact_bucket: str,
+        datasets_bucket: str,
+    ):
         """
         1. Download the original dataset from Dropbox
         2. Validate and label it
         3. Upload the labeled dataset to the artifact bucket
         4. Upload the labeled dataset to the datasets bucket
+        :param collection_id:
         :param dataset_id:
         :param dropbox_url:
         :param artifact_bucket:
@@ -256,7 +297,9 @@
         )

         # Validate and label the dataset
-        file_with_labels, can_convert_to_seurat = self.validate_h5ad_file_and_add_labels(dataset_id, local_filename)
+        file_with_labels, can_convert_to_seurat = self.validate_h5ad_file_and_add_labels(
+            collection_id, dataset_id, local_filename
+        )
         # Process metadata
         metadata = self.extract_metadata(file_with_labels)
         self.business_logic.set_dataset_metadata(dataset_id, metadata)
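
Depending on whether the collection's links include a DOI, populate_dataset_citation produces one of two shapes. A sketch with placeholder IDs and the base URLs seen in the tests (the real values come from CorporaConfig):

    # With a DOI link on the collection:
    with_doi = (
        "Publication: https://doi.org/12.2345/science.abc1234 "
        "Dataset Version: https://datasets.cellxgene.cziscience.com/<dataset_version_id>.h5ad "
        "curated and distributed by CZ CELLxGENE Discover in Collection: "
        "https://cellxgene.cziscience.com/collections/<collection_version_id>"
    )

    # Without a DOI link, the "Publication: ..." prefix is simply omitted:
    without_doi = (
        "Dataset Version: https://datasets.cellxgene.cziscience.com/<dataset_version_id>.h5ad "
        "curated and distributed by CZ CELLxGENE Discover in Collection: "
        "https://cellxgene.cziscience.com/collections/<collection_version_id>"
    )

Note that the citation is only written when schema_4_feature_flag is "true", and extract_metadata then reads the same value back out of adata.uns, so it is persisted to the database and served by the Discover API.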

tests/unit/backend/layers/api/test_curation_api.py (4 changes: 4 additions & 0 deletions)

@@ -1024,6 +1024,10 @@ def test_get_collection_version_ok(self):
             "cell_count": 10,
             "primary_cell_count": 5,
             "cell_type": [{"label": "test_cell_type_label", "ontology_term_id": "test_cell_type_term_id"}],
+            "citation": "Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
             "dataset_id": f"{first_version.datasets[0].dataset_id.id}",
             "dataset_version_id": f"{first_version.datasets[0].version_id.id}",
             "default_embedding": "X_embedding_1",

tests/unit/backend/layers/business/test_business.py (4 changes: 4 additions & 0 deletions)

@@ -162,6 +162,10 @@ def mock_config_fn(name):
             feature_count=400,
             feature_reference=["NCBITaxon:9606"],
             raw_data_location="raw.X",
+            citation="Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         )
         self.s3_provider.mock_s3_fs = set()

tests/unit/backend/layers/common/base_test.py (4 changes: 4 additions & 0 deletions)

@@ -143,6 +143,10 @@ def mock_config_fn(name):
             feature_count=400,
             feature_reference=["NCBITaxon:9606"],
             raw_data_location="raw.X",
+            citation="Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         )

         self.sample_collection_metadata = CollectionMetadata(

tests/unit/processing/test_extract_metadata.py (9 changes: 9 additions & 0 deletions)

@@ -92,6 +92,10 @@ def test_extract_metadata(self, mock_read_h5ad):
             "batch_condition": np.array({"batchA", "batchB"}),
             "schema_version": "3.0.0",
             "default_embedding": "X_umap",
+            "citation": "Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         }

         var = pandas.DataFrame(
@@ -166,6 +170,7 @@ def test_extract_metadata(self, mock_read_h5ad):
         self.assertEqual(extracted_metadata.x_approximate_distribution, "NORMAL")
         self.assertEqual(extracted_metadata.batch_condition, np.array({"batchA", "batchB"}))
         self.assertEqual(extracted_metadata.schema_version, "3.0.0")
+        self.assertEqual(extracted_metadata.citation, uns["citation"])

         self.assertEqual(extracted_metadata.cell_count, 50001)
         self.assertEqual(extracted_metadata.primary_cell_count, 0)
@@ -249,6 +254,10 @@ def test_extract_metadata_find_raw_layer(self, mock_read_h5ad):
             "X_approximate_distribution": "normal",
             "batch_condition": np.array({"batchA", "batchB"}),
             "schema_version": "3.0.0",
+            "citation": "Publication: https://doi.org/12.2345/science.abc1234 Dataset Version: "
+            "https://datasets.cellxgene.cziscience.com/dataset_id.h5ad curated and distributed by "
+            "CZ CELLxGENE Discover in Collection: "
+            "https://cellxgene.cziscience.com/collections/collection_id",
         }

         var = pandas.DataFrame(

(1 more changed file, with 54 additions and 6 deletions, did not load and is not shown.)
