Skip to content

Commit 7b20d55

Browse files
feat(l2g): extend colocalisation neighbourhood metrics to missing genes in the vicinity (#851)
* feat(l2g): wip - limit colocalisation neighbourhood to protein coding genes * feat: adjust logic in `common_neighbourhood_colocalisation_feature_logic` * fix: correct logic in `common_neighbourhood_vep_feature_logic` * chore: adjust tests * chore: wip * feat: add `extend_missing_colocalisation_to_neighbourhood_genes` * feat: add variant index as a dependency to the colocalisation nbh features * test: add `test_extend_missing_colocalisation_to_neighbourhood_genes` * chore: minor * chore: fixes tests
1 parent 13c2040 commit 7b20d55

File tree

4 files changed

+252
-79
lines changed

4 files changed

+252
-79
lines changed

docs/python_api/datasets/l2g_features/colocalisation.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@ title: From colocalisation
2020
## Common logic
2121

2222
::: gentropy.dataset.l2g_features.colocalisation.common_colocalisation_feature_logic
23+
::: gentropy.dataset.l2g_features.colocalisation.extend_missing_colocalisation_to_neighbourhood_genes
2324
::: gentropy.dataset.l2g_features.colocalisation.common_neighbourhood_colocalisation_feature_logic

src/gentropy/dataset/l2g_features/colocalisation.py

Lines changed: 108 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
1414
from gentropy.dataset.study_index import StudyIndex
1515
from gentropy.dataset.study_locus import StudyLocus
16+
from gentropy.dataset.variant_index import VariantIndex
1617

1718
if TYPE_CHECKING:
1819
from pyspark.sql import DataFrame
@@ -68,6 +69,54 @@ def common_colocalisation_feature_logic(
6869
)
6970

7071

72+
def extend_missing_colocalisation_to_neighbourhood_genes(
73+
feature_name: str,
74+
local_features: DataFrame,
75+
variant_index: VariantIndex,
76+
gene_index: GeneIndex,
77+
study_locus: StudyLocus,
78+
) -> DataFrame:
79+
"""This function creates an artificial dataset of features that represents the missing colocalisation to the neighbourhood genes.
80+
81+
Args:
82+
feature_name (str): The name of the feature to extend
83+
local_features (DataFrame): The dataframe of features to extend
84+
variant_index (VariantIndex): Variant index containing all variant/gene relationships
85+
gene_index (GeneIndex): Gene index to fetch the gene information
86+
study_locus (StudyLocus): Study locus to traverse between colocalisation and variant index
87+
88+
Returns:
89+
DataFrame: Dataframe of features that include genes in the neighbourhood not present in the colocalisation results. For these genes, the feature value is set to 0.
90+
"""
91+
coding_variant_gene_lut = (
92+
variant_index.df.select(
93+
"variantId", f.explode("transcriptConsequences").alias("tc")
94+
)
95+
.select(f.col("tc.targetId").alias("geneId"), "variantId")
96+
.join(gene_index.df.select("geneId", "biotype"), "geneId", "left")
97+
.filter(f.col("biotype") == "protein_coding")
98+
.drop("biotype")
99+
.distinct()
100+
)
101+
local_features_w_variant = local_features.join(
102+
study_locus.df.select("studyLocusId", "variantId"), "studyLocusId"
103+
)
104+
return (
105+
# Get the genes that are not present in the colocalisation results
106+
coding_variant_gene_lut.join(
107+
local_features_w_variant, ["variantId", "geneId"], "left_anti"
108+
)
109+
# We now link the missing variant/gene to the study locus from the original dataframe
110+
.join(
111+
local_features_w_variant.select("studyLocusId", "variantId").distinct(),
112+
"variantId",
113+
)
114+
.drop("variantId")
115+
# Fill the information for missing genes with 0
116+
.withColumn(feature_name, f.lit(0.0))
117+
)
118+
119+
71120
def common_neighbourhood_colocalisation_feature_logic(
72121
study_loci_to_annotate: StudyLocus | L2GGoldStandard,
73122
colocalisation_method: str,
@@ -79,6 +128,7 @@ def common_neighbourhood_colocalisation_feature_logic(
79128
study_index: StudyIndex,
80129
gene_index: GeneIndex,
81130
study_locus: StudyLocus,
131+
variant_index: VariantIndex,
82132
) -> DataFrame:
83133
"""Wrapper to call the logic that creates a type of colocalisation features.
84134
@@ -92,6 +142,7 @@ def common_neighbourhood_colocalisation_feature_logic(
92142
study_index (StudyIndex): Study index to fetch study type and gene
93143
gene_index (GeneIndex): Gene index to add gene type
94144
study_locus (StudyLocus): Study locus to traverse between colocalisation and study index
145+
variant_index (VariantIndex): Variant index to annotate all overlapping genes
95146
96147
Returns:
97148
DataFrame: Feature annotation in long format with the columns: studyLocusId, geneId, featureName, featureValue
@@ -107,11 +158,23 @@ def common_neighbourhood_colocalisation_feature_logic(
107158
colocalisation=colocalisation,
108159
study_index=study_index,
109160
study_locus=study_locus,
110-
).join(gene_index.df.select("geneId", "biotype"), "geneId", "left")
161+
)
162+
extended_local_max = local_max.unionByName(
163+
extend_missing_colocalisation_to_neighbourhood_genes(
164+
local_feature_name,
165+
local_max,
166+
variant_index,
167+
gene_index,
168+
study_locus,
169+
)
170+
)
111171
# Compute average score in the vicinity (feature will be the same for any gene associated with a studyLocus)
112172
# (non protein coding genes in the vicinity are excluded see #3552)
113173
regional_mean_per_study_locus = (
114-
local_max.filter(f.col("biotype") == "protein_coding")
174+
extended_local_max.join(
175+
gene_index.df.select("geneId", "biotype"), "geneId", "left"
176+
)
177+
.filter(f.col("biotype") == "protein_coding")
115178
.groupBy("studyLocusId")
116179
.agg(f.mean(local_feature_name).alias("regional_mean"))
117180
)
@@ -121,7 +184,7 @@ def common_neighbourhood_colocalisation_feature_logic(
121184
feature_name,
122185
f.col(local_feature_name) - f.coalesce(f.col("regional_mean"), f.lit(0.0)),
123186
)
124-
.drop("regional_mean", local_feature_name, "biotype")
187+
.drop("regional_mean", local_feature_name)
125188
)
126189

127190

@@ -171,7 +234,13 @@ def compute(
171234
class EQtlColocClppMaximumNeighbourhoodFeature(L2GFeature):
172235
"""Max CLPP for each (study, locus) aggregating over all eQTLs."""
173236

174-
feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
237+
feature_dependency_type = [
238+
Colocalisation,
239+
StudyIndex,
240+
GeneIndex,
241+
StudyLocus,
242+
VariantIndex,
243+
]
175244
feature_name = "eQtlColocClppMaximumNeighbourhood"
176245

177246
@classmethod
@@ -256,7 +325,13 @@ def compute(
256325
class PQtlColocClppMaximumNeighbourhoodFeature(L2GFeature):
257326
"""Max CLPP for each (study, locus, gene) aggregating over all pQTLs."""
258327

259-
feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
328+
feature_dependency_type = [
329+
Colocalisation,
330+
StudyIndex,
331+
GeneIndex,
332+
StudyLocus,
333+
VariantIndex,
334+
]
260335
feature_name = "pQtlColocClppMaximumNeighbourhood"
261336

262337
@classmethod
@@ -340,7 +415,13 @@ def compute(
340415
class SQtlColocClppMaximumNeighbourhoodFeature(L2GFeature):
341416
"""Max CLPP for each (study, locus, gene) aggregating over all sQTLs."""
342417

343-
feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
418+
feature_dependency_type = [
419+
Colocalisation,
420+
StudyIndex,
421+
GeneIndex,
422+
StudyLocus,
423+
VariantIndex,
424+
]
344425
feature_name = "sQtlColocClppMaximumNeighbourhood"
345426

346427
@classmethod
@@ -424,7 +505,13 @@ def compute(
424505
class EQtlColocH4MaximumNeighbourhoodFeature(L2GFeature):
425506
"""Max H4 for each (study, locus) aggregating over all eQTLs."""
426507

427-
feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
508+
feature_dependency_type = [
509+
Colocalisation,
510+
StudyIndex,
511+
GeneIndex,
512+
StudyLocus,
513+
VariantIndex,
514+
]
428515
feature_name = "eQtlColocH4MaximumNeighbourhood"
429516

430517
@classmethod
@@ -508,7 +595,13 @@ def compute(
508595
class PQtlColocH4MaximumNeighbourhoodFeature(L2GFeature):
509596
"""Max H4 for each (study, locus) aggregating over all pQTLs."""
510597

511-
feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
598+
feature_dependency_type = [
599+
Colocalisation,
600+
StudyIndex,
601+
GeneIndex,
602+
StudyLocus,
603+
VariantIndex,
604+
]
512605
feature_name = "pQtlColocH4MaximumNeighbourhood"
513606

514607
@classmethod
@@ -592,7 +685,13 @@ def compute(
592685
class SQtlColocH4MaximumNeighbourhoodFeature(L2GFeature):
593686
"""Max H4 for each (study, locus) aggregating over all sQTLs."""
594687

595-
feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
688+
feature_dependency_type = [
689+
Colocalisation,
690+
StudyIndex,
691+
GeneIndex,
692+
StudyLocus,
693+
VariantIndex,
694+
]
596695
feature_name = "sQtlColocH4MaximumNeighbourhood"
597696

598697
@classmethod

src/gentropy/dataset/l2g_features/vep.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def common_neighbourhood_vep_feature_logic(
114114
feature_name,
115115
f.col(local_feature_name) - f.coalesce(f.col("regional_mean"), f.lit(0.0)),
116116
)
117-
.drop("regional_metric", local_feature_name, "biotype")
117+
.drop("regional_mean", local_feature_name, "biotype")
118118
)
119119

120120

0 commit comments

Comments
 (0)