diff --git a/src/metakb/transformers/base.py b/src/metakb/transformers/base.py index 9df5a1a5..8fa414a4 100644 --- a/src/metakb/transformers/base.py +++ b/src/metakb/transformers/base.py @@ -32,7 +32,12 @@ ) from ga4gh.va_spec.base import Document, Method, TherapyGroup from ga4gh.vrs.models import Allele -from gene.schemas import NormalizeService as NormalizedGene +from gene.schemas import ( + NamespacePrefix as GeneNamespacePrefix, +) +from gene.schemas import ( + NormalizeService as NormalizedGene, +) from pydantic import BaseModel, Field, StrictStr, ValidationError from therapy.schemas import NormalizationService as NormalizedTherapy @@ -554,20 +559,31 @@ def _get_vicc_normalizer_mappings( :return: List of VICC Normalizer data represented as mappable concept """ - def _add_merged_id_ext( + def _update_mapping( mapping: ConceptMapping, - is_priority: bool, - label: str | None = None, + normalized_id: str, + normalizer_label: str, + match_on_coding_id: bool = True, ) -> Extension: - """Update ``mapping`` to include extension on whether mapping is from merged identifier + """Update ``mapping`` to include extension on whether ``mapping`` contains + code that matches the merged record's primary identifier. :param mapping: ConceptMapping from vicc normalizer. This will be mutated. - :param is_priority: ``True`` if concept mapping contains primaryCode that - matches merged record primaryCode. ``False`` otherwise (meaning it comes - from merged record mappings) - :param label: Merged concept label, if found - :return: ConceptMapping with normalizer extension added + Extensions will be added. Label will be added if mapping identifier + matches normalized merged identifier. + :param normalized_id: Concept ID from normalized record + :param normalizer_label: Label from normalized record + :param match_on_coding_id: Whether to match on ``coding.id`` or + ``coding.code`` (MONDO is represented differently) + :return: ConceptMapping with normalizer extension added as well as label ( + if mapping id matches normalized merged id) """ + is_priority = ( + normalized_id == mapping.coding.id + if match_on_coding_id + else normalized_id == mapping.coding.code.root.lower() + ) + merged_id_ext = Extension( name=NormalizerExtensionName.PRIORITY.value, value=is_priority ) @@ -576,39 +592,44 @@ def _add_merged_id_ext( else: mapping.extensions = [merged_id_ext] - if label: - mapping.coding.label = label + if is_priority: + mapping.coding.label = normalizer_label return mapping mappings: list[ConceptMapping] = [] attr_name = NORMALIZER_INSTANCE_TO_ATTR[type(normalizer_resp)] normalizer_resp_obj = getattr(normalizer_resp, attr_name) + normalizer_label = normalizer_resp_obj.label + is_disease = isinstance(normalizer_resp, NormalizedDisease) + is_gene = isinstance(normalizer_resp, NormalizedGene) normalizer_mappings = normalizer_resp_obj.mappings or [] - if isinstance(normalizer_resp, NormalizedDisease): - for mapping in normalizer_mappings: - if mapping.coding.code.root.lower().startswith( - DiseaseNamespacePrefix.MONDO.value + for mapping in normalizer_mappings: + if normalized_id == mapping.coding.id: + mappings.append( + _update_mapping(mapping, normalized_id, normalizer_label) + ) + else: + if ( + is_disease + and mapping.coding.code.root.lower().startswith( + DiseaseNamespacePrefix.MONDO.value + ) + ) or ( + is_gene + and mapping.coding.id.startswith( + (GeneNamespacePrefix.NCBI.value, GeneNamespacePrefix.HGNC.value) + ) ): - mappings.append(_add_merged_id_ext(mapping, is_priority=False)) - else: - if normalized_id == mapping.coding.id: - mappings.append( - _add_merged_id_ext( - mapping, - label=normalizer_resp_obj.label, - is_priority=True, - ) + mappings.append( + _update_mapping( + mapping, + normalized_id, + normalizer_label, + match_on_coding_id=is_gene, ) - else: - mappings.extend( - _add_merged_id_ext( - mapping, label=normalizer_resp_obj.label, is_priority=True - ) - for mapping in normalizer_mappings - if normalized_id == mapping.coding.id - ) + ) return mappings def create_json(self, cdm_filepath: Path | None = None) -> None: diff --git a/src/metakb/transformers/civic.py b/src/metakb/transformers/civic.py index 088cae0b..c2b313ab 100644 --- a/src/metakb/transformers/civic.py +++ b/src/metakb/transformers/civic.py @@ -816,20 +816,27 @@ def _add_genes(self, genes: list[dict]) -> None: :param genes: All genes in CIViC """ + + def _get_ncbi_concept_mapping(ncbigene_id: str, gene: dict) -> ConceptMapping: + """Get NCBI gene mapping + + :param ncbigene_id: ID for NCBI Gene + :param gene: CIViC gene record + :return: Concept Mapping for NCBI Gene + """ + return ConceptMapping( + coding=Coding( + id=ncbigene_id, + code=str(gene["entrez_id"]), + system="https://www.ncbi.nlm.nih.gov/gene/", + ), + relation=Relation.EXACT_MATCH, + ) + for gene in genes: gene_id = f"civic.gid:{gene['id']}" ncbigene = f"ncbigene:{gene['entrez_id']}" queries = [ncbigene, gene["name"]] + gene["aliases"] - mappings = [ - ConceptMapping( - coding=Coding( - id=ncbigene, - code=str(gene["entrez_id"]), - system="https://www.ncbi.nlm.nih.gov/gene/", - ), - relation=Relation.EXACT_MATCH, - ), - ] extensions = [] gene_norm_resp, normalized_gene_id = self.vicc_normalizers.normalize_gene( @@ -843,13 +850,31 @@ def _add_genes(self, genes: list[dict]) -> None: queries, ) extensions.append(self._get_vicc_normalizer_failure_ext()) + mappings = [_get_ncbi_concept_mapping(ncbigene, gene)] else: - mappings.extend( - self._get_vicc_normalizer_mappings( - normalized_gene_id, gene_norm_resp - ) + mappings = self._get_vicc_normalizer_mappings( + normalized_gene_id, gene_norm_resp ) + civic_ncbi_annotation_match = False + for mapping in mappings: + if mapping.coding.id.startswith("ncbigene:"): + if mapping.coding.id == ncbigene: + mapping.extensions.append( + Extension(name="civic_annotation", value=True) + ) + civic_ncbi_annotation_match = True + break + + _logger.debug( + "CIViC NCBI gene and Gene Normalizer mismatch: %s vs %s", + ncbigene, + mapping.coding.id, + ) + + if not civic_ncbi_annotation_match: + mappings.append(_get_ncbi_concept_mapping(ncbigene, gene)) + if gene["aliases"]: extensions.append(Extension(name="aliases", value=gene["aliases"])) diff --git a/tests/conftest.py b/tests/conftest.py index 79e82216..78504076 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,6 +46,11 @@ def pytest_configure(config): logging.getLogger(lib).setLevel(logging.ERROR) +def get_vicc_normalizer_ext(is_priority: bool): + """Create test fixture for vicc normalizer priority extension""" + return [{"name": "vicc_normalizer_priority", "value": is_priority}] + + def check_source_harvest(tmp_path: Path, harvester: Harvester): """Test that source harvest method works correctly""" harvested_data = harvester.harvest() @@ -514,6 +519,10 @@ def civic_gid5(braf_normalizer_mappings): "system": "https://www.ncbi.nlm.nih.gov/gene/", }, "relation": "exactMatch", + "extensions": [ + *get_vicc_normalizer_ext(is_priority=False), + {"name": "civic_annotation", "value": True}, + ], }, *braf_normalizer_mappings, ], @@ -747,6 +756,10 @@ def civic_gid19(): "system": "https://www.ncbi.nlm.nih.gov/gene/", }, "relation": "exactMatch", + "extensions": [ + *get_vicc_normalizer_ext(is_priority=False), + {"name": "civic_annotation", "value": True}, + ], }, { "coding": { @@ -1594,6 +1607,10 @@ def civic_gid29(): "code": "3815", }, "relation": "exactMatch", + "extensions": [ + *get_vicc_normalizer_ext(is_priority=False), + {"name": "civic_annotation", "value": True}, + ], }, { "coding": { @@ -1805,8 +1822,17 @@ def moa_abl1(): "system": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/", }, "relation": "exactMatch", - "extensions": get_vicc_normalizer_priority_ext(is_priority=True), - } + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, + { + "coding": { + "id": "ncbigene:25", + "code": "25", + "system": "https://www.ncbi.nlm.nih.gov/gene/", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } diff --git a/tests/unit/transformers/test_civic_transformer_diagnostic.py b/tests/unit/transformers/test_civic_transformer_diagnostic.py index 6cdadfaf..8993b550 100644 --- a/tests/unit/transformers/test_civic_transformer_diagnostic.py +++ b/tests/unit/transformers/test_civic_transformer_diagnostic.py @@ -4,7 +4,11 @@ import pytest import pytest_asyncio -from tests.conftest import TEST_TRANSFORMERS_DIR, get_vicc_normalizer_priority_ext +from tests.conftest import ( + TEST_TRANSFORMERS_DIR, + get_vicc_normalizer_ext, + get_vicc_normalizer_priority_ext, +) from metakb.transformers.civic import CivicTransformer @@ -189,6 +193,10 @@ def civic_gid38(): "system": "https://www.ncbi.nlm.nih.gov/gene/", }, "relation": "exactMatch", + "extensions": [ + *get_vicc_normalizer_ext(is_priority=False), + {"name": "civic_annotation", "value": True}, + ], }, { "coding": { @@ -468,6 +476,10 @@ def civic_gid42(): "system": "https://www.ncbi.nlm.nih.gov/gene/", }, "relation": "exactMatch", + "extensions": [ + *get_vicc_normalizer_ext(is_priority=False), + {"name": "civic_annotation", "value": True}, + ], }, { "coding": { diff --git a/tests/unit/transformers/test_moa_transformer_prognostic.py b/tests/unit/transformers/test_moa_transformer_prognostic.py index 69ac9914..3aeb8170 100644 --- a/tests/unit/transformers/test_moa_transformer_prognostic.py +++ b/tests/unit/transformers/test_moa_transformer_prognostic.py @@ -4,7 +4,11 @@ import pytest import pytest_asyncio -from tests.conftest import TEST_TRANSFORMERS_DIR, get_vicc_normalizer_priority_ext +from tests.conftest import ( + TEST_TRANSFORMERS_DIR, + get_vicc_normalizer_ext, + get_vicc_normalizer_priority_ext, +) from metakb.transformers.moa import MoaTransformer @@ -165,6 +169,15 @@ def moa_bcor(): "relation": "exactMatch", "extensions": get_vicc_normalizer_priority_ext(is_priority=True), }, + { + "coding": { + "id": "ncbigene:54880", + "code": "54880", + "system": "https://www.ncbi.nlm.nih.gov/gene/", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } @@ -333,6 +346,15 @@ def moa_sf3b1(): "relation": "exactMatch", "extensions": get_vicc_normalizer_priority_ext(is_priority=True), }, + { + "coding": { + "id": "ncbigene:23451", + "code": "23451", + "system": "https://www.ncbi.nlm.nih.gov/gene/", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } diff --git a/tests/unit/transformers/test_moa_transformer_therapeutic.py b/tests/unit/transformers/test_moa_transformer_therapeutic.py index 0e3d7f21..306479c0 100644 --- a/tests/unit/transformers/test_moa_transformer_therapeutic.py +++ b/tests/unit/transformers/test_moa_transformer_therapeutic.py @@ -5,6 +5,7 @@ from tests.conftest import ( TEST_TRANSFORMERS_DIR, get_transformed_data, + get_vicc_normalizer_ext, get_vicc_normalizer_failure_ext, get_vicc_normalizer_priority_ext, ) @@ -144,6 +145,18 @@ def moa_aid154_study_stmt( braf_normalizer_mappings, ): """Create MOA AID 154 study statement test fixture. Uses CombinationTherapy.""" + braf_normalizer_mappings_cpy = braf_normalizer_mappings[:] + braf_normalizer_mappings_cpy.append( + { + "coding": { + "id": "ncbigene:673", + "code": "673", + "system": "https://www.ncbi.nlm.nih.gov/gene/", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, + ) return { "id": "moa.assertion:154", "type": "Statement", @@ -231,7 +244,7 @@ def moa_aid154_study_stmt( "id": "moa.normalize.gene.hgnc:1097", "conceptType": "Gene", "label": "BRAF", - "mappings": braf_normalizer_mappings, + "mappings": braf_normalizer_mappings_cpy, }, }, "specifiedBy": moa_method,