feat: Schema 4.0.0 enrichment (#6273)
MillenniumFalconMechanic authored Nov 29, 2023
1 parent 9857ef6 commit 6192386
Showing 2 changed files with 89 additions and 2 deletions.
26 changes: 26 additions & 0 deletions backend/portal/api/enrichment.py
@@ -5,6 +5,8 @@

from collections import OrderedDict

from backend.common.feature_flag import FeatureFlagService, FeatureFlagValues


def enrich_dataset_with_ancestors(dataset, key, ontology_mapping):
"""
@@ -15,6 +17,16 @@ def enrich_dataset_with_ancestors(dataset, key, ontology_mapping):

terms = [e["ontology_term_id"] for e in dataset[key]]

is_schema_4 = FeatureFlagService.is_enabled(FeatureFlagValues.SCHEMA_4)
is_tissue = key == "tissue"
if is_tissue and is_schema_4:
# TODO remove is_schema_4 condition once Schema 4 is rolled out and
# feature flag is removed (#6266). "tissue" must include "tissue_type"
# when generating ancestors; "cell_type" and "development_stage" do not.
terms = [generate_tagged_tissue_ontology_id(e) for e in dataset[key]]
else:
terms = [e["ontology_term_id"] for e in dataset[key]]

if not terms:
return

@@ -23,3 +35,17 @@ def enrich_dataset_with_ancestors(dataset, key, ontology_mapping):
unique_ancestors = list(OrderedDict.fromkeys(flattened_ancestors))
if unique_ancestors:
dataset[f"{key}_ancestors"] = unique_ancestors


def generate_tagged_tissue_ontology_id(tissue):
"""
Generate ontology ID tagged with tissue_type for the given tissue. For
example, UBERON:1234567 (organoid).
"""
tissue_id = tissue["ontology_term_id"]
# Handle possible None for tissue_type (possible during migration): default
# to "tissue".
tissue_type = tissue["tissue_type"] or "tissue"
if tissue_type == "tissue":
return tissue_id
return f"{tissue_id} ({tissue_type})"
65 changes: 63 additions & 2 deletions tests/unit/backend/layers/api/test_portal_api.py
@@ -1727,12 +1727,15 @@ def test__get_all_user_datasets_for_index_requires_auth(self):
self.assertEqual(response.status_code, 401)

# ✅
def test__get_all_datasets_for_index_with_ontology_expansion(self):
def test__get_all_datasets_for_index_with_ontology_expansion_deprecated(self):
# TODO deprecated - remove with #6266. Keeping temporarily to ensure
# backwards compatibility while running both 3.0.0 and 4.0.0 (behind
# a feature flag) versions of the code.
import copy

modified_metadata = copy.deepcopy(self.sample_dataset_metadata)
modified_metadata.development_stage = [OntologyTermId("Test", "HsapDv:0000008")]
modified_metadata.tissue = [TissueOntologyTermId("Test", "UBERON:0002048", "cell culture")]
modified_metadata.tissue = [TissueOntologyTermId("Test", "UBERON:0002048")]
modified_metadata.cell_type = [OntologyTermId("Test", "CL:0000738")]

dataset = self.generate_dataset(metadata=modified_metadata, publish=True)
@@ -1797,6 +1800,64 @@ def convert_ontology(ontologies):
],
)

def test__get_all_datasets_for_index_with_ontology_expansion(self):
# Schema 4.0.0 version of
# test__get_all_datasets_for_index_with_ontology_expansion_deprecated
# above. Remove this comment with #6266.
import copy

modified_metadata = copy.deepcopy(self.sample_dataset_metadata)
modified_metadata.development_stage = [OntologyTermId("Test", "HsapDv:0000008")]
modified_metadata.tissue = [TissueOntologyTermId("Test", "UBERON:0000995", "organoid")]
modified_metadata.cell_type = [OntologyTermId("Test", "CL:0000738")]

dataset = self.generate_dataset(metadata=modified_metadata, publish=True)

test_url = furl(path="/dp/v1/datasets/index")

headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()}
response = self.app.get(test_url.url, headers=headers)
self.assertEqual(200, response.status_code)
body = json.loads(response.data)

actual_dataset = None
for d in body:
if d["id"] == dataset.dataset_version_id:
actual_dataset = d
self.assertIsNotNone(actual_dataset)

def convert_ontology(ontologies):
return [dataclasses.asdict(o) for o in ontologies]

if actual_dataset is not None: # pylance
self.assertEqual(actual_dataset["development_stage"], convert_ontology(modified_metadata.development_stage))
self.assertEqual(
actual_dataset["development_stage_ancestors"],
["HsapDv:0000008", "HsapDv:0000006", "HsapDv:0000002", "HsapDv:0000045", "HsapDv:0000001"],
)

self.assertEqual(actual_dataset["tissue"], convert_ontology(modified_metadata.tissue))
# TODO update with fix for #6192.
self.assertCountEqual(
actual_dataset["tissue_ancestors"],
["UBERON:0000995 (organoid)"],
)

self.assertEqual(actual_dataset["cell_type"], convert_ontology(modified_metadata.cell_type))
self.assertCountEqual(
actual_dataset["cell_type_ancestors"],
[
"CL:0000255",
"CL:0002371",
"CL:0000988",
"CL:0000738",
"CL:0000548",
"CL:0000219",
"CL:0000003",
"CL:0002242",
],
)

# ✅
def test__get_dataset_assets(self):
# TODO: I don't think `filename` is relevant - review
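As an end-to-end illustration of the enrichment flow exercised by the new test, here is a rough, self-contained sketch. The shape of ontology_mapping (term ID mapped to a list containing the term and its ancestors) and the fallback to the tagged term itself when it is absent from the mapping are assumptions made for illustration; they are consistent with the tissue_ancestors assertion above but are not taken verbatim from the repository.

from collections import OrderedDict

def tag_tissue(tissue):
    # Mirrors generate_tagged_tissue_ontology_id from the diff above.
    tissue_type = tissue["tissue_type"] or "tissue"
    term_id = tissue["ontology_term_id"]
    return term_id if tissue_type == "tissue" else f"{term_id} ({tissue_type})"

def enrich_tissue_ancestors(dataset, ontology_mapping):
    # Simplified stand-in for enrich_dataset_with_ancestors(dataset, "tissue", ...)
    # with the Schema 4 branch always taken.
    terms = [tag_tissue(e) for e in dataset["tissue"]]
    if not terms:
        return
    # Assumed lookup: fall back to the term itself when it is not in the mapping.
    ancestors = [ontology_mapping.get(term, [term]) for term in terms]
    flattened = [a for per_term in ancestors for a in per_term]
    unique = list(OrderedDict.fromkeys(flattened))
    if unique:
        dataset["tissue_ancestors"] = unique

# Illustrative values only: an organoid tissue with no entry in the mapping.
dataset = {"tissue": [{"ontology_term_id": "UBERON:0000995", "tissue_type": "organoid"}]}
enrich_tissue_ancestors(dataset, ontology_mapping={})
print(dataset["tissue_ancestors"])  # ['UBERON:0000995 (organoid)']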
