Skip to content

Commit c286c3b

Browse files
refactor(vep_parser): store consequence to impact score as a project config (#811)
* refactor: set variant functional consequence to pathogenicity score as class attribute * chore: drop `VariantIndex.get_most_severe_gene_consequence` * chore(VariantIndex): make `CONSEQUENCE_TO_PATHOGENICITY_SCORE` a class attribute * fix(vep): convert `id_to_score_map` to `label_to_score_map` * chore: remove comment * refactor: move `consequence_to_pathogenicity_score` to `VariantIndexConfig`
1 parent 70fd593 commit c286c3b

File tree

6 files changed

+84
-136
lines changed

6 files changed

+84
-136
lines changed

src/gentropy/assets/data/variant_consequence_to_score.tsv

Lines changed: 0 additions & 46 deletions
This file was deleted.

src/gentropy/common/spark_helpers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -447,14 +447,14 @@ def order_array_of_structs_by_two_fields(
447447
)
448448

449449

450-
def map_column_by_dictionary(col: Column, mapping_dict: dict[str, str]) -> Column:
450+
def map_column_by_dictionary(col: Column, mapping_dict: dict[str, Any]) -> Column:
451451
"""Map column values to dictionary values by key.
452452
453453
Missing consequence label will be converted to None, unmapped consequences will be mapped as None.
454454
455455
Args:
456456
col (Column): Column containing labels to map.
457-
mapping_dict (dict[str, str]): Dictionary with mapping key/value pairs.
457+
mapping_dict (dict[str, Any]): Dictionary with mapping key/value pairs.
458458
459459
Returns:
460460
Column: Column with mapped values.

src/gentropy/config.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import os
44
from dataclasses import dataclass, field
5-
from typing import Any, List
5+
from typing import Any, ClassVar, List, TypedDict
66

77
from hail import __file__ as hail_location
88
from hydra.core.config_store import ConfigStore
@@ -348,11 +348,73 @@ class GnomadVariantConfig(StepConfig):
348348
class VariantIndexConfig(StepConfig):
349349
"""Variant index step configuration."""
350350

351+
class _ConsequenceToPathogenicityScoreMap(TypedDict):
352+
"""Typing definition for CONSEQUENCE_TO_PATHOGENICITY_SCORE."""
353+
354+
id: str
355+
label: str
356+
score: float
357+
351358
session: SessionConfig = SessionConfig()
352359
vep_output_json_path: str = MISSING
353360
variant_index_path: str = MISSING
354361
gnomad_variant_annotations_path: str | None = None
355362
hash_threshold: int = 300
363+
consequence_to_pathogenicity_score: ClassVar[
364+
list[_ConsequenceToPathogenicityScoreMap]
365+
] = [
366+
{"id": "SO_0001575", "label": "splice_donor_variant", "score": 1.0},
367+
{"id": "SO_0001589", "label": "frameshift_variant", "score": 1.0},
368+
{"id": "SO_0001574", "label": "splice_acceptor_variant", "score": 1.0},
369+
{"id": "SO_0001587", "label": "stop_gained", "score": 1.0},
370+
{"id": "SO_0002012", "label": "start_lost", "score": 1.0},
371+
{"id": "SO_0001578", "label": "stop_lost", "score": 1.0},
372+
{"id": "SO_0001893", "label": "transcript_ablation", "score": 1.0},
373+
{"id": "SO_0001822", "label": "inframe_deletion", "score": 0.66},
374+
{
375+
"id": "SO_0001818",
376+
"label": "protein_altering_variant",
377+
"score": 0.66,
378+
},
379+
{"id": "SO_0001821", "label": "inframe_insertion", "score": 0.66},
380+
{
381+
"id": "SO_0001787",
382+
"label": "splice_donor_5th_base_variant",
383+
"score": 0.66,
384+
},
385+
{"id": "SO_0001583", "label": "missense_variant", "score": 0.66},
386+
{"id": "SO_0001567", "label": "stop_retained_variant", "score": 0.33},
387+
{"id": "SO_0001630", "label": "splice_region_variant", "score": 0.33},
388+
{"id": "SO_0002019", "label": "start_retained_variant", "score": 0.33},
389+
{
390+
"id": "SO_0002169",
391+
"label": "splice_polypyrimidine_tract_variant",
392+
"score": 0.33,
393+
},
394+
{"id": "SO_0001819", "label": "synonymous_variant", "score": 0.33},
395+
{
396+
"id": "SO_0002170",
397+
"label": "splice_donor_region_variant",
398+
"score": 0.33,
399+
},
400+
{"id": "SO_0001624", "label": "3_prime_UTR_variant", "score": 0.1},
401+
{"id": "SO_0001623", "label": "5_prime_UTR_variant", "score": 0.1},
402+
{"id": "SO_0001627", "label": "intron_variant", "score": 0.1},
403+
{
404+
"id": "SO_0001619",
405+
"label": "non_coding_transcript_variant",
406+
"score": 0.0,
407+
},
408+
{"id": "SO_0001580", "label": "coding_sequence_variant", "score": 0.0},
409+
{"id": "SO_0001632", "label": "downstream_gene_variant", "score": 0.0},
410+
{"id": "SO_0001631", "label": "upstream_gene_variant", "score": 0.0},
411+
{
412+
"id": "SO_0001792",
413+
"label": "non_coding_transcript_exon_variant",
414+
"score": 0.0,
415+
},
416+
{"id": "SO_0001620", "label": "mature_miRNA_variant", "score": 0.0},
417+
]
356418

357419
_target_: str = "gentropy.variant_index.VariantIndexStep"
358420

src/gentropy/dataset/variant_index.py

Lines changed: 6 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from gentropy.common.schemas import parse_spark_schema
1212
from gentropy.common.spark_helpers import (
1313
get_nested_struct_schema,
14-
get_record_with_maximum_value,
1514
rename_all_columns,
1615
safe_array_union,
1716
)
@@ -22,7 +21,6 @@
2221
from pyspark.sql.types import StructType
2322

2423

25-
2624
@dataclass
2725
class VariantIndex(Dataset):
2826
"""Dataset for representing variants and methods applied on them."""
@@ -130,7 +128,6 @@ def add_annotation(
130128
# Prefix for renaming columns:
131129
prefix = "annotation_"
132130

133-
134131
# Generate select expressions to merge and import columns from annotation:
135132
select_expressions = []
136133

@@ -146,9 +143,13 @@ def add_annotation(
146143
if isinstance(field.dataType.elementType, t.StructType):
147144
# Extract the schema of the array to get the order of the fields:
148145
array_schema = [
149-
field for field in VariantIndex.get_schema().fields if field.name == column
146+
field
147+
for field in VariantIndex.get_schema().fields
148+
if field.name == column
150149
][0].dataType
151-
fields_order = get_nested_struct_schema(array_schema).fieldNames()
150+
fields_order = get_nested_struct_schema(
151+
array_schema
152+
).fieldNames()
152153
select_expressions.append(
153154
safe_array_union(
154155
f.col(column), f.col(f"{prefix}{column}"), fields_order
@@ -286,48 +287,3 @@ def get_loftee(self: VariantIndex) -> DataFrame:
286287
"isHighQualityPlof",
287288
)
288289
)
289-
290-
def get_most_severe_gene_consequence(
291-
self: VariantIndex,
292-
*,
293-
vep_consequences: DataFrame,
294-
) -> DataFrame:
295-
"""Returns a dataframe with the most severe consequence for a variant/gene pair.
296-
297-
Args:
298-
vep_consequences (DataFrame): A dataframe of VEP consequences
299-
300-
Returns:
301-
DataFrame: A dataframe with the most severe consequence (plus a severity score) for a variant/gene pair
302-
"""
303-
return (
304-
self.df.select("variantId", f.explode("transcriptConsequences").alias("tc"))
305-
.select(
306-
"variantId",
307-
f.col("tc.targetId"),
308-
f.explode(f.col("tc.variantFunctionalConsequenceIds")).alias(
309-
"variantFunctionalConsequenceId"
310-
),
311-
)
312-
.join(
313-
# TODO: make this table a project config
314-
f.broadcast(
315-
vep_consequences.selectExpr(
316-
"variantFunctionalConsequenceId", "score as severityScore"
317-
)
318-
),
319-
on="variantFunctionalConsequenceId",
320-
how="inner",
321-
)
322-
.filter(f.col("severityScore").isNull())
323-
.transform(
324-
# A variant can have multiple predicted consequences on a transcript, the most severe one is selected
325-
lambda df: get_record_with_maximum_value(
326-
df, ["variantId", "targetId"], "severityScore"
327-
)
328-
)
329-
.withColumnRenamed(
330-
"variantFunctionalConsequenceId",
331-
"mostSevereVariantFunctionalConsequenceId",
332-
)
333-
)

src/gentropy/datasource/ensembl/vep_parser.py

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,12 @@
22

33
from __future__ import annotations
44

5-
import importlib.resources as pkg_resources
65
from typing import TYPE_CHECKING
76

8-
import pandas as pd
97
from pyspark.sql import SparkSession
108
from pyspark.sql import functions as f
119
from pyspark.sql import types as t
1210

13-
from gentropy.assets import data
1411
from gentropy.common.schemas import parse_spark_schema
1512
from gentropy.common.spark_helpers import (
1613
enforce_schema,
@@ -24,9 +21,12 @@
2421
if TYPE_CHECKING:
2522
from pyspark.sql import Column, DataFrame
2623

24+
from gentropy.config import VariantIndexConfig
25+
2726

2827
class VariantEffectPredictorParser:
2928
"""Collection of methods to parse VEP output in json format."""
29+
3030
# NOTE: Because the comparison of the xrefs is done on the basis of rsids,
3131
# if the field `colocalised_variants` has multiple rsids, extracting xrefs will result in
3232
# an array of xref structs, rather than the struct itself.
@@ -568,22 +568,16 @@ def process_vep_output(
568568
Returns:
569569
DataFrame: processed data in the right shape.
570570
"""
571-
so_df = pd.read_csv(
572-
pkg_resources.open_text(
573-
data, "variant_consequence_to_score.tsv", encoding="utf-8"
574-
),
575-
sep="\t",
576-
)
577-
578-
# Reading consequence to sequence ontology map:
571+
# Consequence to sequence ontology map:
579572
sequence_ontology_map = {
580-
row["label"]: row["variantFunctionalConsequenceId"]
581-
for _, row in so_df.iterrows()
573+
item["label"]: item["id"]
574+
for item in VariantIndexConfig.consequence_to_pathogenicity_score
575+
}
576+
# Consequence label to pathogenicity score map:
577+
label_to_score_map = {
578+
item["label"]: item["score"]
579+
for item in VariantIndexConfig.consequence_to_pathogenicity_score
582580
}
583-
584-
# Reading score dictionary:
585-
score_dictionary = {row["label"]: row["score"] for _, row in so_df.iterrows()}
586-
587581
# Processing VEP output:
588582
return (
589583
vep_output
@@ -694,7 +688,7 @@ def process_vep_output(
694688
f.transform(
695689
transcript.consequence_terms,
696690
lambda term: map_column_by_dictionary(
697-
term, score_dictionary
691+
term, label_to_score_map
698692
),
699693
)
700694
)

tests/gentropy/dataset/test_variant_index.py

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from gentropy.dataset.variant_index import VariantIndex
1212

1313
if TYPE_CHECKING:
14-
from pyspark.sql import DataFrame, SparkSession
14+
from pyspark.sql import SparkSession
1515

1616

1717
def test_variant_index_creation(mock_variant_index: VariantIndex) -> None:
@@ -144,24 +144,6 @@ def test_get_distance_to_gene(
144144
for col in expected_cols:
145145
assert col in observed.columns, f"Column {col} not in {observed.columns}"
146146

147-
def test_get_most_severe_gene_consequence(
148-
self: TestVariantIndex,
149-
mock_variant_index: VariantIndex,
150-
mock_variant_consequence_to_score: DataFrame,
151-
) -> None:
152-
"""Assert that the function returns a df with the requested columns."""
153-
expected_cols = [
154-
"variantId",
155-
"targetId",
156-
"mostSevereVariantFunctionalConsequenceId",
157-
"severityScore",
158-
]
159-
observed = mock_variant_index.get_most_severe_gene_consequence(
160-
vep_consequences=mock_variant_consequence_to_score
161-
)
162-
for col in expected_cols:
163-
assert col in observed.columns, f"Column {col} not in {observed.columns}"
164-
165147
def test_get_loftee(
166148
self: TestVariantIndex, mock_variant_index: VariantIndex
167149
) -> None:

0 commit comments

Comments
 (0)