13
13
from gentropy .dataset .l2g_gold_standard import L2GGoldStandard
14
14
from gentropy .dataset .study_index import StudyIndex
15
15
from gentropy .dataset .study_locus import StudyLocus
16
+ from gentropy .dataset .variant_index import VariantIndex
16
17
17
18
if TYPE_CHECKING :
18
19
from pyspark .sql import DataFrame
@@ -68,6 +69,54 @@ def common_colocalisation_feature_logic(
68
69
)
69
70
70
71
72
+ def extend_missing_colocalisation_to_neighbourhood_genes (
73
+ feature_name : str ,
74
+ local_features : DataFrame ,
75
+ variant_index : VariantIndex ,
76
+ gene_index : GeneIndex ,
77
+ study_locus : StudyLocus ,
78
+ ) -> DataFrame :
79
+ """This function creates an artificial dataset of features that represents the missing colocalisation to the neighbourhood genes.
80
+
81
+ Args:
82
+ feature_name (str): The name of the feature to extend
83
+ local_features (DataFrame): The dataframe of features to extend
84
+ variant_index (VariantIndex): Variant index containing all variant/gene relationships
85
+ gene_index (GeneIndex): Gene index to fetch the gene information
86
+ study_locus (StudyLocus): Study locus to traverse between colocalisation and variant index
87
+
88
+ Returns:
89
+ DataFrame: Dataframe of features that include genes in the neighbourhood not present in the colocalisation results. For these genes, the feature value is set to 0.
90
+ """
91
+ coding_variant_gene_lut = (
92
+ variant_index .df .select (
93
+ "variantId" , f .explode ("transcriptConsequences" ).alias ("tc" )
94
+ )
95
+ .select (f .col ("tc.targetId" ).alias ("geneId" ), "variantId" )
96
+ .join (gene_index .df .select ("geneId" , "biotype" ), "geneId" , "left" )
97
+ .filter (f .col ("biotype" ) == "protein_coding" )
98
+ .drop ("biotype" )
99
+ .distinct ()
100
+ )
101
+ local_features_w_variant = local_features .join (
102
+ study_locus .df .select ("studyLocusId" , "variantId" ), "studyLocusId"
103
+ )
104
+ return (
105
+ # Get the genes that are not present in the colocalisation results
106
+ coding_variant_gene_lut .join (
107
+ local_features_w_variant , ["variantId" , "geneId" ], "left_anti"
108
+ )
109
+ # We now link the missing variant/gene to the study locus from the original dataframe
110
+ .join (
111
+ local_features_w_variant .select ("studyLocusId" , "variantId" ).distinct (),
112
+ "variantId" ,
113
+ )
114
+ .drop ("variantId" )
115
+ # Fill the information for missing genes with 0
116
+ .withColumn (feature_name , f .lit (0.0 ))
117
+ )
118
+
119
+
71
120
def common_neighbourhood_colocalisation_feature_logic (
72
121
study_loci_to_annotate : StudyLocus | L2GGoldStandard ,
73
122
colocalisation_method : str ,
@@ -79,6 +128,7 @@ def common_neighbourhood_colocalisation_feature_logic(
79
128
study_index : StudyIndex ,
80
129
gene_index : GeneIndex ,
81
130
study_locus : StudyLocus ,
131
+ variant_index : VariantIndex ,
82
132
) -> DataFrame :
83
133
"""Wrapper to call the logic that creates a type of colocalisation features.
84
134
@@ -92,6 +142,7 @@ def common_neighbourhood_colocalisation_feature_logic(
92
142
study_index (StudyIndex): Study index to fetch study type and gene
93
143
gene_index (GeneIndex): Gene index to add gene type
94
144
study_locus (StudyLocus): Study locus to traverse between colocalisation and study index
145
+ variant_index (VariantIndex): Variant index to annotate all overlapping genes
95
146
96
147
Returns:
97
148
DataFrame: Feature annotation in long format with the columns: studyLocusId, geneId, featureName, featureValue
@@ -107,11 +158,23 @@ def common_neighbourhood_colocalisation_feature_logic(
107
158
colocalisation = colocalisation ,
108
159
study_index = study_index ,
109
160
study_locus = study_locus ,
110
- ).join (gene_index .df .select ("geneId" , "biotype" ), "geneId" , "left" )
161
+ )
162
+ extended_local_max = local_max .unionByName (
163
+ extend_missing_colocalisation_to_neighbourhood_genes (
164
+ local_feature_name ,
165
+ local_max ,
166
+ variant_index ,
167
+ gene_index ,
168
+ study_locus ,
169
+ )
170
+ )
111
171
# Compute average score in the vicinity (feature will be the same for any gene associated with a studyLocus)
112
172
# (non protein coding genes in the vicinity are excluded see #3552)
113
173
regional_mean_per_study_locus = (
114
- local_max .filter (f .col ("biotype" ) == "protein_coding" )
174
+ extended_local_max .join (
175
+ gene_index .df .select ("geneId" , "biotype" ), "geneId" , "left"
176
+ )
177
+ .filter (f .col ("biotype" ) == "protein_coding" )
115
178
.groupBy ("studyLocusId" )
116
179
.agg (f .mean (local_feature_name ).alias ("regional_mean" ))
117
180
)
@@ -121,7 +184,7 @@ def common_neighbourhood_colocalisation_feature_logic(
121
184
feature_name ,
122
185
f .col (local_feature_name ) - f .coalesce (f .col ("regional_mean" ), f .lit (0.0 )),
123
186
)
124
- .drop ("regional_mean" , local_feature_name , "biotype" )
187
+ .drop ("regional_mean" , local_feature_name )
125
188
)
126
189
127
190
@@ -171,7 +234,13 @@ def compute(
171
234
class EQtlColocClppMaximumNeighbourhoodFeature (L2GFeature ):
172
235
"""Max CLPP for each (study, locus) aggregating over all eQTLs."""
173
236
174
- feature_dependency_type = [Colocalisation , StudyIndex , GeneIndex , StudyLocus ]
237
+ feature_dependency_type = [
238
+ Colocalisation ,
239
+ StudyIndex ,
240
+ GeneIndex ,
241
+ StudyLocus ,
242
+ VariantIndex ,
243
+ ]
175
244
feature_name = "eQtlColocClppMaximumNeighbourhood"
176
245
177
246
@classmethod
@@ -256,7 +325,13 @@ def compute(
256
325
class PQtlColocClppMaximumNeighbourhoodFeature (L2GFeature ):
257
326
"""Max CLPP for each (study, locus, gene) aggregating over all pQTLs."""
258
327
259
- feature_dependency_type = [Colocalisation , StudyIndex , GeneIndex , StudyLocus ]
328
+ feature_dependency_type = [
329
+ Colocalisation ,
330
+ StudyIndex ,
331
+ GeneIndex ,
332
+ StudyLocus ,
333
+ VariantIndex ,
334
+ ]
260
335
feature_name = "pQtlColocClppMaximumNeighbourhood"
261
336
262
337
@classmethod
@@ -340,7 +415,13 @@ def compute(
340
415
class SQtlColocClppMaximumNeighbourhoodFeature (L2GFeature ):
341
416
"""Max CLPP for each (study, locus, gene) aggregating over all sQTLs."""
342
417
343
- feature_dependency_type = [Colocalisation , StudyIndex , GeneIndex , StudyLocus ]
418
+ feature_dependency_type = [
419
+ Colocalisation ,
420
+ StudyIndex ,
421
+ GeneIndex ,
422
+ StudyLocus ,
423
+ VariantIndex ,
424
+ ]
344
425
feature_name = "sQtlColocClppMaximumNeighbourhood"
345
426
346
427
@classmethod
@@ -424,7 +505,13 @@ def compute(
424
505
class EQtlColocH4MaximumNeighbourhoodFeature (L2GFeature ):
425
506
"""Max H4 for each (study, locus) aggregating over all eQTLs."""
426
507
427
- feature_dependency_type = [Colocalisation , StudyIndex , GeneIndex , StudyLocus ]
508
+ feature_dependency_type = [
509
+ Colocalisation ,
510
+ StudyIndex ,
511
+ GeneIndex ,
512
+ StudyLocus ,
513
+ VariantIndex ,
514
+ ]
428
515
feature_name = "eQtlColocH4MaximumNeighbourhood"
429
516
430
517
@classmethod
@@ -508,7 +595,13 @@ def compute(
508
595
class PQtlColocH4MaximumNeighbourhoodFeature (L2GFeature ):
509
596
"""Max H4 for each (study, locus) aggregating over all pQTLs."""
510
597
511
- feature_dependency_type = [Colocalisation , StudyIndex , GeneIndex , StudyLocus ]
598
+ feature_dependency_type = [
599
+ Colocalisation ,
600
+ StudyIndex ,
601
+ GeneIndex ,
602
+ StudyLocus ,
603
+ VariantIndex ,
604
+ ]
512
605
feature_name = "pQtlColocH4MaximumNeighbourhood"
513
606
514
607
@classmethod
@@ -592,7 +685,13 @@ def compute(
592
685
class SQtlColocH4MaximumNeighbourhoodFeature (L2GFeature ):
593
686
"""Max H4 for each (study, locus) aggregating over all sQTLs."""
594
687
595
- feature_dependency_type = [Colocalisation , StudyIndex , GeneIndex , StudyLocus ]
688
+ feature_dependency_type = [
689
+ Colocalisation ,
690
+ StudyIndex ,
691
+ GeneIndex ,
692
+ StudyLocus ,
693
+ VariantIndex ,
694
+ ]
596
695
feature_name = "sQtlColocH4MaximumNeighbourhood"
597
696
598
697
@classmethod
0 commit comments