Merge branch 'issue-426' into disease-conflict

korikuzma · korikuzma · commit a9a2042f1a2a · 2025-02-06T08:23:03.000-05:00
diff --git a/src/metakb/transformers/base.py b/src/metakb/transformers/base.py
@@ -9,9 +9,6 @@
 from pathlib import Path
 from typing import ClassVar, TypeVar
 
-from disease.schemas import (
-    SYSTEM_URI_TO_NAMESPACE as DISEASE_SYSTEM_URI_TO_NAMESPACE,
-)
 from disease.schemas import (
     NamespacePrefix as DiseaseNamespacePrefix,
 )
@@ -35,7 +32,12 @@
 )
 from ga4gh.va_spec.base import Document, Method, TherapyGroup
 from ga4gh.vrs.models import Allele
-from gene.schemas import NormalizeService as NormalizedGene
+from gene.schemas import (
+    NamespacePrefix as GeneNamespacePrefix,
+)
+from gene.schemas import (
+    NormalizeService as NormalizedGene,
+)
 from pydantic import BaseModel, Field, StrictStr, ValidationError
 from therapy.schemas import NormalizationService as NormalizedTherapy
 
@@ -557,20 +559,24 @@ def _get_vicc_normalizer_mappings(
         :return: List of VICC Normalizer data represented as mappable concept
         """
 
-        def _add_merged_id_ext(
+        def _update_mapping(
             mapping: ConceptMapping,
-            is_priority: bool,
-            label: str | None = None,
+            normalized_id: str,
+            normalizer_label: str,
         ) -> Extension:
-            """Update ``mapping`` to include extension on whether mapping is from merged identifier
+            """Update ``mapping`` to include extension on whether ``mapping`` contains
+            code that matches the merged record's primary identifier.
 
             :param mapping: ConceptMapping from vicc normalizer. This will be mutated.
-            :param is_priority: ``True`` if concept mapping contains primaryCode that
-                matches merged record primaryCode. ``False`` otherwise (meaning it comes
-                from merged record mappings)
-            :param label: Merged concept label, if found
-            :return: ConceptMapping with normalizer extension added
+                Extensions will be added. Label will be added if mapping identifier
+                matches normalized merged identifier.
+            :param normalized_id: Concept ID from normalized record
+            :param normalizer_label: Label from normalized record
+            :return: ConceptMapping with normalizer extension added, as well as label
+                (if mapping id matches normalized merged id)
             """
+            is_priority = normalized_id == mapping.coding.code.root
+
             merged_id_ext = Extension(
                 name=NormalizerExtensionName.PRIORITY.value, value=is_priority
             )
@@ -579,40 +585,40 @@ def _add_merged_id_ext(
             else:
                 mapping.extensions = [merged_id_ext]
 
-            if label:
-                mapping.coding.label = label
+            if is_priority:
+                mapping.coding.label = normalizer_label
 
             return mapping
 
         mappings: list[ConceptMapping] = []
         attr_name = NORMALIZER_INSTANCE_TO_ATTR[type(normalizer_resp)]
         normalizer_resp_obj = getattr(normalizer_resp, attr_name)
+        normalizer_label = normalizer_resp_obj.label
+        is_disease = isinstance(normalizer_resp, NormalizedDisease)
+        is_gene = isinstance(normalizer_resp, NormalizedGene)
 
         normalizer_mappings = normalizer_resp_obj.mappings or []
-        if isinstance(normalizer_resp, NormalizedDisease):
-            for mapping in normalizer_mappings:
+        for mapping in normalizer_mappings:
+            if normalized_id == mapping.coding.code.root:
+                mappings.append(
+                    _update_mapping(mapping, normalized_id, normalizer_label)
+                )
+            else:
+                mapping_code_lower = mapping.coding.code.root.lower()
                 if (
-                    DISEASE_SYSTEM_URI_TO_NAMESPACE.get(mapping.coding.system)
-                    == DiseaseNamespacePrefix.MONDO.value
+                    is_disease
+                    and mapping_code_lower.startswith(
+                        DiseaseNamespacePrefix.MONDO.value
+                    )
+                ) or (
+                    is_gene
+                    and mapping_code_lower.startswith(
+                        (GeneNamespacePrefix.NCBI.value, GeneNamespacePrefix.HGNC.value)
+                    )
                 ):
-                    mappings.append(_add_merged_id_ext(mapping, is_priority=False))
-                else:
-                    if normalized_id == mapping.coding.code.root:
-                        mappings.append(
-                            _add_merged_id_ext(
-                                mapping,
-                                label=normalizer_resp_obj.label,
-                                is_priority=True,
-                            )
-                        )
-        else:
-            mappings.extend(
-                _add_merged_id_ext(
-                    mapping, label=normalizer_resp_obj.label, is_priority=True
-                )
-                for mapping in normalizer_mappings
-                if normalized_id == mapping.coding.code.root
-            )
+                    mappings.append(
+                        _update_mapping(mapping, normalized_id, normalizer_label)
+                    )
         return mappings
 
     def create_json(self, cdm_filepath: Path | None = None) -> None:
diff --git a/src/metakb/transformers/civic.py b/src/metakb/transformers/civic.py
@@ -816,20 +816,27 @@ def _add_genes(self, genes: list[dict]) -> None:
 
         :param genes: All genes in CIViC
         """
+
+        def _get_ncbi_concept_mapping(ncbigene_id: str, gene: dict) -> ConceptMapping:
+            """Get NCBI gene mapping
+
+            :param ncbigene_id: ID for NCBI Gene
+            :param gene: CIViC gene record
+            :return: Concept Mapping for NCBI Gene
+            """
+            return ConceptMapping(
+                coding=Coding(
+                    id=ncbigene_id,
+                    code=str(gene["entrez_id"]),
+                    system="https://www.ncbi.nlm.nih.gov/gene/",
+                ),
+                relation=Relation.EXACT_MATCH,
+            )
+
         for gene in genes:
             gene_id = f"civic.gid:{gene['id']}"
             ncbigene = f"ncbigene:{gene['entrez_id']}"
             queries = [ncbigene, gene["name"]] + gene["aliases"]
-            mappings = [
-                ConceptMapping(
-                    coding=Coding(
-                        id=ncbigene,
-                        code=str(gene["entrez_id"]),
-                        system="https://www.ncbi.nlm.nih.gov/gene/",
-                    ),
-                    relation=Relation.EXACT_MATCH,
-                ),
-            ]
             extensions = []
 
             gene_norm_resp, normalized_gene_id = self.vicc_normalizers.normalize_gene(
@@ -843,13 +850,31 @@ def _add_genes(self, genes: list[dict]) -> None:
                     queries,
                 )
                 extensions.append(self._get_vicc_normalizer_failure_ext())
+                mappings = [_get_ncbi_concept_mapping(ncbigene, gene)]
             else:
-                mappings.extend(
-                    self._get_vicc_normalizer_mappings(
-                        normalized_gene_id, gene_norm_resp
-                    )
+                mappings = self._get_vicc_normalizer_mappings(
+                    normalized_gene_id, gene_norm_resp
                 )
 
+                civic_ncbi_annotation_match = False
+                for mapping in mappings:
+                    if mapping.coding.code.root.startswith("ncbigene:"):
+                        if mapping.coding.code.root == ncbigene:
+                            mapping.extensions.append(
+                                Extension(name="civic_annotation", value=True)
+                            )
+                            civic_ncbi_annotation_match = True
+                            break
+
+                        _logger.debug(
+                            "CIViC NCBI gene and Gene Normalizer mismatch: %s vs %s",
+                            ncbigene,
+                            mapping.coding.code.root,
+                        )
+
+                if not civic_ncbi_annotation_match:
+                    mappings.append(_get_ncbi_concept_mapping(ncbigene, gene))
+
             if gene["aliases"]:
                 extensions.append(Extension(name="aliases", value=gene["aliases"]))
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -46,6 +46,11 @@ def pytest_configure(config):
             logging.getLogger(lib).setLevel(logging.ERROR)
 
 
+def get_vicc_normalizer_ext(is_priority: bool):
+    """Create test fixture for vicc normalizer priority extension"""
+    return [{"name": "vicc_normalizer_priority", "value": is_priority}]
+
+
 def check_source_harvest(tmp_path: Path, harvester: Harvester):
     """Test that source harvest method works correctly"""
     harvested_data = harvester.harvest()
@@ -499,11 +504,14 @@ def civic_gid5(braf_normalizer_mappings):
         "mappings": [
             {
                 "coding": {
-                    "id": "ncbigene:673",
-                    "code": "673",
+                    "code": "ncbigene:673",
                     "system": "https://www.ncbi.nlm.nih.gov/gene/",
                 },
-                "relation": "exactMatch",
+                "relation": "relatedMatch",
+                "extensions": [
+                    *get_vicc_normalizer_ext(is_priority=False),
+                    {"name": "civic_annotation", "value": True},
+                ],
             },
             *braf_normalizer_mappings,
         ],
@@ -732,11 +740,14 @@ def civic_gid19():
         "mappings": [
             {
                 "coding": {
-                    "id": "ncbigene:1956",
-                    "code": "1956",
+                    "code": "ncbigene:1956",
                     "system": "https://www.ncbi.nlm.nih.gov/gene/",
                 },
-                "relation": "exactMatch",
+                "relation": "relatedMatch",
+                "extensions": [
+                    *get_vicc_normalizer_ext(is_priority=False),
+                    {"name": "civic_annotation", "value": True},
+                ],
             },
             {
                 "coding": {
@@ -1568,10 +1579,13 @@ def civic_gid29():
             {
                 "coding": {
                     "system": "https://www.ncbi.nlm.nih.gov/gene/",
-                    "id": "ncbigene:3815",
-                    "code": "3815",
+                    "code": "ncbigene:3815",
                 },
-                "relation": "exactMatch",
+                "relation": "relatedMatch",
+                "extensions": [
+                    *get_vicc_normalizer_ext(is_priority=False),
+                    {"name": "civic_annotation", "value": True},
+                ],
             },
             {
                 "coding": {
@@ -1781,8 +1795,16 @@ def moa_abl1():
                     "system": "https://www.genenames.org",
                 },
                 "relation": "exactMatch",
-                "extensions": get_vicc_normalizer_priority_ext(is_priority=True),
-            }
+                "extensions": get_vicc_normalizer_ext(is_priority=True),
+            },
+            {
+                "coding": {
+                    "code": "ncbigene:25",
+                    "system": "https://www.ncbi.nlm.nih.gov/gene/",
+                },
+                "relation": "relatedMatch",
+                "extensions": get_vicc_normalizer_ext(is_priority=False),
+            },
         ],
     }
 
diff --git a/tests/unit/transformers/test_civic_transformer_diagnostic.py b/tests/unit/transformers/test_civic_transformer_diagnostic.py
@@ -4,7 +4,11 @@
 
 import pytest
 import pytest_asyncio
-from tests.conftest import TEST_TRANSFORMERS_DIR, get_vicc_normalizer_priority_ext
+from tests.conftest import (
+    TEST_TRANSFORMERS_DIR,
+    get_vicc_normalizer_ext,
+    get_vicc_normalizer_priority_ext,
+)
 
 from metakb.transformers.civic import CivicTransformer
 
@@ -184,11 +188,14 @@ def civic_gid38():
         "mappings": [
             {
                 "coding": {
-                    "id": "ncbigene:5156",
-                    "code": "5156",
+                    "code": "ncbigene:5156",
                     "system": "https://www.ncbi.nlm.nih.gov/gene/",
                 },
-                "relation": "exactMatch",
+                "relation": "relatedMatch",
+                "extensions": [
+                    *get_vicc_normalizer_ext(is_priority=False),
+                    {"name": "civic_annotation", "value": True},
+                ],
             },
             {
                 "coding": {
@@ -460,11 +467,14 @@ def civic_gid42():
         "mappings": [
             {
                 "coding": {
-                    "id": "ncbigene:5979",
-                    "code": "5979",
+                    "code": "ncbigene:5979",
                     "system": "https://www.ncbi.nlm.nih.gov/gene/",
                 },
-                "relation": "exactMatch",
+                "relation": "relatedMatch",
+                "extensions": [
+                    *get_vicc_normalizer_ext(is_priority=False),
+                    {"name": "civic_annotation", "value": True},
+                ],
             },
             {
                 "coding": {
diff --git a/tests/unit/transformers/test_moa_transformer_prognostic.py b/tests/unit/transformers/test_moa_transformer_prognostic.py
@@ -4,7 +4,11 @@
 
 import pytest
 import pytest_asyncio
-from tests.conftest import TEST_TRANSFORMERS_DIR, get_vicc_normalizer_priority_ext
+from tests.conftest import (
+    TEST_TRANSFORMERS_DIR,
+    get_vicc_normalizer_ext,
+    get_vicc_normalizer_priority_ext,
+)
 
 from metakb.transformers.moa import MoaTransformer
 
@@ -162,6 +166,14 @@ def moa_bcor():
                 "relation": "exactMatch",
                 "extensions": get_vicc_normalizer_priority_ext(is_priority=True),
             },
+            {
+                "coding": {
+                    "code": "ncbigene:54880",
+                    "system": "https://www.ncbi.nlm.nih.gov/gene/",
+                },
+                "relation": "relatedMatch",
+                "extensions": get_vicc_normalizer_ext(is_priority=False),
+            },
         ],
     }
 
@@ -329,6 +341,14 @@ def moa_sf3b1():
                 "relation": "exactMatch",
                 "extensions": get_vicc_normalizer_priority_ext(is_priority=True),
             },
+            {
+                "coding": {
+                    "code": "ncbigene:23451",
+                    "system": "https://www.ncbi.nlm.nih.gov/gene/",
+                },
+                "relation": "relatedMatch",
+                "extensions": get_vicc_normalizer_ext(is_priority=False),
+            },
         ],
     }
 
diff --git a/tests/unit/transformers/test_moa_transformer_therapeutic.py b/tests/unit/transformers/test_moa_transformer_therapeutic.py