test: add transcript selection checks

GenomicMedLab · Jan 25, 2024 · c7f5826 · c7f5826
1 parent 01889fb
commit c7f5826
Show file tree

Hide file tree

Showing 5 changed files with 115 additions and 68 deletions.
diff --git a/misc/bug_hunting/check_transcript.py b/misc/bug_hunting/check_transcript.py
@@ -2,7 +2,7 @@
 import asyncio
 import pickle
 
-from cool_seq_tool.schemas import Strand
+from cool_seq_tool.schemas import Strand, TranscriptPriority
 
 from dcd_mapping.align import align
 from dcd_mapping.resources import get_scoreset_metadata, get_scoreset_records
@@ -11,7 +11,7 @@
 logging.basicConfig(
     filename="dcd-check-transcript.log",
     format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
-    level=logging.WARNING,
+    level=logging.DEBUG,
     force=True,
 )
 _logger = logging.getLogger(__name__)
@@ -27,9 +27,6 @@
 
 async def check_tx_results():
     for urn in urns:
-        print(f"Checking {urn}...")
-        if urn != "urn:mavedb:00000083-i-1":
-            continue
         try:
             # prereqs
             metadata = get_scoreset_metadata(urn)
@@ -41,27 +38,35 @@ async def check_tx_results():
         try:
             tx_result = await select_transcript(metadata, records, alignment)
         except Exception as e:
-            _logger.error("%s error during transcript selection: %s", urn, e)
+            if urn in expected_mappings:
+                _logger.error("%s error during transcript selection: %s", urn, e)
+            else:
+                _logger.error("%s error during transcript selection (not in expected_mappings): %s", urn, e)
             continue
 
-        breakpoint()
         if urn not in mave_blat_dict:
             continue  # not in original experiment
-        if urn not in expected_mappings and tx_result is not None:
-            _logger.error("%s performed transcript selection when it shouldn't have", urn)
+        if urn not in expected_mappings:
+            if tx_result is not None:
+                _logger.error("%s performed transcript selection when it shouldn't have", urn)
+            else:
+                _logger.info("Skipping %s as expected", urn)
             continue
         if urn in expected_mappings and tx_result is None:
             _logger.error("%s skipped transcript selection when it shouldn't have", urn)
             continue
 
-        expected = expected_mappings[urn]
+        def reformat_mane(status: str):
+            return TranscriptPriority[status.replace(" ", "_").upper()]
+
         try:
+            expected = expected_mappings[urn]
             for (name, actual, expected) in [
                 ("NP accession", tx_result.np, expected[0]),
                 ("start position", tx_result.start, expected[1]),
                 ("is full match", tx_result.is_full_match, expected[3]),
                 ("NM accession", tx_result.nm, expected[4]),
-                ("Tx priority", tx_result.transcript_mode, expected[5])
+                ("Tx priority", tx_result.transcript_mode, reformat_mane(expected[5]))
             ]:
                 if actual != expected:
                     _logger.error(
@@ -74,4 +79,6 @@ async def check_tx_results():
         except Exception as e:
             _logger.error("%s exception: %s", urn, e)
 
+        _logger.info("%s completed successfully", urn)  # not a warning but w/e
+
 asyncio.run(check_tx_results())
diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py
@@ -3,7 +3,7 @@
 Outstanding questions/confusion
 -------------------------------
 * ``urn:mavedb:00000097-n-1``: unable to find any matching transcripts
-* Lots of scoresets (2023-) giving index errors in offset calculation
+* Lots of scoresets (esp. 2023-) giving index errors in offset calculation
 * Remove MANE sorting once upstream sorting is confirmed
 """
 import logging
@@ -274,7 +274,8 @@ def _offset_target_sequence(metadata: ScoresetMetadata, records: List[ScoreRow])
                 protein_sequence[i + p1 - p0] == amino_acids_by_position[p1],
                 protein_sequence[i + p2 - p0] == amino_acids_by_position[p2],
                 protein_sequence[i + p3 - p0] == amino_acids_by_position[p3],
-                protein_sequence[i + p4 - p0] == amino_acids_by_position[p4],
+                protein_sequence[i + p4 - p0]
+                == amino_acids_by_position[p4],  # TODO problem here-ish
             ]
         ):
             if i + 1 == min(amino_acids_by_position.keys()) or i + 2 == min(

diff --git a/tests/fixtures/align_result.json b/tests/fixtures/align_result.json
@@ -23,16 +23,6 @@
       { "start": 37403170, "end": 37403325 }
     ]
   },
-  "urn:mavedb:00000098-a-1": {
-    "chrom": "chr3",
-    "strand": -1,
-    "coverage": 100.0,
-    "ident_pct": 100.0,
-    "query_range": { "start": 0, "end": 12 },
-    "query_subranges": [{ "start": 0, "end": 12 }],
-    "hit_range": { "start": 38551475, "end": 38551511 },
-    "hit_subranges": [{ "start": 38551475, "end": 38551511 }]
-  },
   "urn:mavedb:00000018-a-1": {
     "chrom": "chr11",
     "strand": 1,
@@ -43,6 +33,40 @@
     "hit_range": { "start": 5227021, "end": 5227208 },
     "hit_subranges": [{ "start": 5227021, "end": 5227208 }]
   },
+  "urn:mavedb:00000001-a-4": {
+    "chrom": "chr16",
+    "strand": 1,
+    "coverage": 100.0,
+    "ident_pct": 100.0,
+    "query_range": { "start": 0, "end": 477 },
+    "query_subranges": [
+      { "start": 0, "end": 66 },
+      { "start": 66, "end": 150 },
+      { "start": 150, "end": 223 },
+      { "start": 223, "end": 333 },
+      { "start": 333, "end": 413 },
+      { "start": 413, "end": 477 }
+    ],
+    "hit_range": { "start": 1314030, "end": 1324793 },
+    "hit_subranges": [
+      { "start": 1314030, "end": 1314096 },
+      { "start": 1314292, "end": 1314376 },
+      { "start": 1315653, "end": 1315726 },
+      { "start": 1320173, "end": 1320283 },
+      { "start": 1320437, "end": 1320517 },
+      { "start": 1324729, "end": 1324793 }
+    ]
+  },
+  "urn:mavedb:00000098-a-1": {
+    "chrom": "chr3",
+    "strand": -1,
+    "coverage": 100.0,
+    "ident_pct": 100.0,
+    "query_range": { "start": 0, "end": 12 },
+    "query_subranges": [{ "start": 0, "end": 12 }],
+    "hit_range": { "start": 38551475, "end": 38551511 },
+    "hit_subranges": [{ "start": 38551475, "end": 38551511 }]
+  },
   "urn:mavedb:00000061-h-1": {
     "chrom": "chr3",
     "strand": -1,

diff --git a/tests/fixtures/transcript_result.json b/tests/fixtures/transcript_result.json
@@ -7,28 +7,35 @@
     "transcript_mode": "mane_select",
     "sequence": "TODO"
   },
-    "urn:mavedb:00000001-a-4": {
-        "nm": "NM_003345.5",
-        "np": "NP_003336.1",
-        "start": 0,
-        "is_full_match": true,
-        "transcript_mode": "mane_select",
-        "sequence": "TODO"
-    },
-    "urn:mavedb:00000098-a-1": {
-        "nm": "NM_000335.5",
-        "np": "NP_000326.2",
-        "start": 1619,
-        "is_full_match": true,
-        "transcript_mode": "mane_select",
-        "sequence": "TODO"
-    },
-    "urn:mavedb:00000061-h-1": {
-        "nm": "NM_002880.4",
-        "np": "NP_002871.1",
-        "start": 51,
-        "is_full_match": true,
-        "transcript_mode": "mane_select",
-        "sequence": "TODO"
-    }
+  "urn:mavedb:00000001-a-4": {
+    "nm": "NM_003345.5",
+    "np": "NP_003336.1",
+    "start": 0,
+    "is_full_match": true,
+    "transcript_mode": "mane_select",
+    "sequence": "TODO"
+  },
+  "urn:mavedb:00000098-a-1": {
+    "nm": "NM_000335.5",
+    "np": "NP_000326.2",
+    "start": 1619,
+    "is_full_match": true,
+    "transcript_mode": "mane_select",
+    "sequence": "TODO"
+  },
+  "urn:mavedb:00000061-h-1": {
+    "nm": "NM_002880.4",
+    "np": "NP_002871.1",
+    "start": 51,
+    "is_full_match": true,
+    "transcript_mode": "mane_select",
+    "sequence": "TODO"
+  },
+  "urn:mavedb:00000068-a-1": {
+    "nm": "NM_000546.6",
+    "np": "NP_000537.3",
+    "start": 0,
+    "is_full_match": false,
+    "transcript_mode": "mane_select"
+  }
 }
diff --git a/tests/unit/test_transcript.py b/tests/unit/test_transcript.py
@@ -8,6 +8,15 @@
 from dcd_mapping.transcripts import select_transcript
 
 
+def check_transcript_results_equality(actual: TxSelectResult, expected: TxSelectResult):
+    """Check equality of transcript selection result vs fixture"""
+    assert actual.np == expected.np
+    assert actual.start == expected.start
+    assert actual.is_full_match is expected.is_full_match
+    assert actual.nm == expected.nm
+    assert actual.transcript_mode == expected.transcript_mode
+
+
 @pytest.mark.asyncio(scope="module")
 async def test_tx_src(
     scoreset_metadata_fixture: Dict[str, ScoresetMetadata],
@@ -17,18 +26,12 @@ async def test_tx_src(
     """Test transcript selection for urn:mavedb:00000041-a-1"""
     urn = "urn:mavedb:00000041-a-1"
     metadata = scoreset_metadata_fixture[urn]
-    records = get_scoreset_records(urn)  # TODO real fixture
+    records = get_scoreset_records(urn)
     alignment_result = align_result_fixture[urn]
     expected = transcript_results_fixture[urn]
-
     actual = await select_transcript(metadata, records, alignment_result)
-
     assert actual
-    assert actual.np == expected.np
-    assert actual.start == expected.start
-    assert actual.is_full_match is expected.is_full_match
-    assert actual.nm == expected.nm
-    assert actual.transcript_mode == expected.transcript_mode
+    check_transcript_results_equality(actual, expected)
 
 
 @pytest.mark.asyncio(scope="module")
@@ -43,15 +46,9 @@ async def test_tx_scn5a(
     records = get_scoreset_records(urn)
     alignment_result = align_result_fixture[urn]
     expected = transcript_results_fixture[urn]
-
     actual = await select_transcript(metadata, records, alignment_result)
-
     assert actual
-    assert actual.np == expected.np
-    assert actual.start == expected.start
-    assert actual.is_full_match is expected.is_full_match
-    assert actual.nm == expected.nm
-    assert actual.transcript_mode == expected.transcript_mode
+    check_transcript_results_equality(actual, expected)
 
 
 @pytest.mark.asyncio(scope="module")
@@ -64,7 +61,6 @@ async def test_tx_hbb(
     metadata = scoreset_metadata_fixture[urn]
     records = get_scoreset_records(urn)
     alignment_result = align_result_fixture[urn]
-
     actual = await select_transcript(metadata, records, alignment_result)
     assert actual is None
 
@@ -81,11 +77,23 @@ async def test_tx_raf(
     records = get_scoreset_records(urn)
     alignment_result = align_result_fixture[urn]
     expected = transcript_results_fixture[urn]
+    actual = await select_transcript(metadata, records, alignment_result)
+    assert actual
+    check_transcript_results_equality(actual, expected)
 
+
+@pytest.mark.asyncio(scope="module")
+async def test_tx_tp53(
+    scoreset_metadata_fixture: Dict[str, ScoresetMetadata],
+    align_result_fixture: Dict[str, AlignmentResult],
+    transcript_results_fixture: Dict[str, TxSelectResult],
+):
+    """Test transcript selection for urn:mavedb:00000068-a-1"""
+    urn = "urn:mavedb:00000068-a-1"
+    metadata = scoreset_metadata_fixture[urn]
+    records = get_scoreset_records(urn)
+    alignment_result = align_result_fixture[urn]
+    expected = transcript_results_fixture[urn]
     actual = await select_transcript(metadata, records, alignment_result)
     assert actual
-    assert actual.np == expected.np
-    assert actual.start == expected.start
-    assert actual.is_full_match is expected.is_full_match
-    assert actual.nm == expected.nm
-    assert actual.transcript_mode == expected.transcript_mode
+    check_transcript_results_equality(actual, expected)