feat: Schema 4.0.0 enrichment (#6273)
MillenniumFalconMechanic authored Nov 29, 2023
1 parent 9857ef6 commit 6192386
Showing 2 changed files with 89 additions and 2 deletions.
26 changes: 26 additions & 0 deletions backend/portal/api/enrichment.py
@@ -5,6 +5,8 @@

from collections import OrderedDict

from backend.common.feature_flag import FeatureFlagService, FeatureFlagValues


def enrich_dataset_with_ancestors(dataset, key, ontology_mapping):
"""
@@ -15,6 +17,16 @@ def enrich_dataset_with_ancestors(dataset, key, ontology_mapping):

terms = [e["ontology_term_id"] for e in dataset[key]]

is_schema_4 = FeatureFlagService.is_enabled(FeatureFlagValues.SCHEMA_4)
is_tissue = key == "tissue"
if is_tissue and is_schema_4:
# TODO remove is_schema_4 condition once Schema 4 is rolled out and
# feature flag is removed (#6266). "tissue" must include "tissue_type"
# when generating ancestors; "cell_type" and "development_stage" do not.
terms = [generate_tagged_tissue_ontology_id(e) for e in dataset[key]]
else:
terms = [e["ontology_term_id"] for e in dataset[key]]

if not terms:
return

@@ -23,3 +35,17 @@ def enrich_dataset_with_ancestors(dataset, key, ontology_mapping):
unique_ancestors = list(OrderedDict.fromkeys(flattened_ancestors))
if unique_ancestors:
dataset[f"{key}_ancestors"] = unique_ancestors


def generate_tagged_tissue_ontology_id(tissue):
"""
Generate ontology ID tagged with tissue_type for the given tissue. For
example, UBERON:1234567 (organoid).
"""
tissue_id = tissue["ontology_term_id"]
# Handle possible None for tissue_type (possible during migration): default
# to "tissue".
tissue_type = tissue["tissue_type"] or "tissue"
if tissue_type == "tissue":
return tissue_id
return f"{tissue_id} ({tissue_type})"
65 changes: 63 additions & 2 deletions tests/unit/backend/layers/api/test_portal_api.py
@@ -1727,12 +1727,15 @@ def test__get_all_user_datasets_for_index_requires_auth(self):
self.assertEqual(response.status_code, 401)

# ✅
def test__get_all_datasets_for_index_with_ontology_expansion(self):
def test__get_all_datasets_for_index_with_ontology_expansion_deprecated(self):
# TODO deprecated - remove with #6266. Keeping temporarily to ensure
# backwards compatibility while running both 3.0.0 and 4.0.0 (behind
# a feature flag) versions of the code.
import copy

modified_metadata = copy.deepcopy(self.sample_dataset_metadata)
modified_metadata.development_stage = [OntologyTermId("Test", "HsapDv:0000008")]
modified_metadata.tissue = [TissueOntologyTermId("Test", "UBERON:0002048", "cell culture")]
modified_metadata.tissue = [TissueOntologyTermId("Test", "UBERON:0002048")]
modified_metadata.cell_type = [OntologyTermId("Test", "CL:0000738")]

dataset = self.generate_dataset(metadata=modified_metadata, publish=True)
@@ -1797,6 +1800,64 @@ def convert_ontology(ontologies):
],
)

def test__get_all_datasets_for_index_with_ontology_expansion(self):
# Schema 4.0.0 version of
# test__get_all_datasets_for_index_with_ontology_expansion_deprecated
# above. Remove this comment with #6266.
import copy

modified_metadata = copy.deepcopy(self.sample_dataset_metadata)
modified_metadata.development_stage = [OntologyTermId("Test", "HsapDv:0000008")]
modified_metadata.tissue = [TissueOntologyTermId("Test", "UBERON:0000995", "organoid")]
modified_metadata.cell_type = [OntologyTermId("Test", "CL:0000738")]

dataset = self.generate_dataset(metadata=modified_metadata, publish=True)

test_url = furl(path="/dp/v1/datasets/index")

headers = {"host": "localhost", "Content-Type": "application/json", "Cookie": self.get_cxguser_token()}
response = self.app.get(test_url.url, headers=headers)
self.assertEqual(200, response.status_code)
body = json.loads(response.data)

actual_dataset = None
for d in body:
if d["id"] == dataset.dataset_version_id:
actual_dataset = d
self.assertIsNotNone(actual_dataset)

def convert_ontology(ontologies):
return [dataclasses.asdict(o) for o in ontologies]

if actual_dataset is not None: # pylance
self.assertEqual(actual_dataset["development_stage"], convert_ontology(modified_metadata.development_stage))
self.assertEqual(
actual_dataset["development_stage_ancestors"],
["HsapDv:0000008", "HsapDv:0000006", "HsapDv:0000002", "HsapDv:0000045", "HsapDv:0000001"],
)

self.assertEqual(actual_dataset["tissue"], convert_ontology(modified_metadata.tissue))
# TODO update with fix for #6192.
self.assertCountEqual(
actual_dataset["tissue_ancestors"],
["UBERON:0000995 (organoid)"],
)

self.assertEqual(actual_dataset["cell_type"], convert_ontology(modified_metadata.cell_type))
self.assertCountEqual(
actual_dataset["cell_type_ancestors"],
[
"CL:0000255",
"CL:0002371",
"CL:0000988",
"CL:0000738",
"CL:0000548",
"CL:0000219",
"CL:0000003",
"CL:0002242",
],
)

# ✅
def test__get_dataset_assets(self):
# TODO: I don't think `filename` is relevant - review
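As an end-to-end illustration of the enrichment flow exercised by the new test, here is a rough, self-contained sketch. The shape of ontology_mapping (term ID mapped to a list containing the term and its ancestors) and the fallback to the tagged term itself when it is absent from the mapping are assumptions made for illustration; they are consistent with the tissue_ancestors assertion above but are not taken verbatim from the repository.

from collections import OrderedDict

def tag_tissue(tissue):
    # Mirrors generate_tagged_tissue_ontology_id from the diff above.
    tissue_type = tissue["tissue_type"] or "tissue"
    term_id = tissue["ontology_term_id"]
    return term_id if tissue_type == "tissue" else f"{term_id} ({tissue_type})"

def enrich_tissue_ancestors(dataset, ontology_mapping):
    # Simplified stand-in for enrich_dataset_with_ancestors(dataset, "tissue", ...)
    # with the Schema 4 branch always taken.
    terms = [tag_tissue(e) for e in dataset["tissue"]]
    if not terms:
        return
    # Assumed lookup: fall back to the term itself when it is not in the mapping.
    ancestors = [ontology_mapping.get(term, [term]) for term in terms]
    flattened = [a for per_term in ancestors for a in per_term]
    unique = list(OrderedDict.fromkeys(flattened))
    if unique:
        dataset["tissue_ancestors"] = unique

# Illustrative values only: an organoid tissue with no entry in the mapping.
dataset = {"tissue": [{"ontology_term_id": "UBERON:0000995", "tissue_type": "organoid"}]}
enrich_tissue_ancestors(dataset, ontology_mapping={})
print(dataset["tissue_ancestors"])  # ['UBERON:0000995 (organoid)']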
