feat(l2g): extend colocalisation neighbourhood metrics to missing genes in the vicinity (#851)

ireneisdoomed · web-flow · commit 7b20d55275ba · 2024-10-21T15:20:48.000+01:00
* feat(l2g): wip - limit colocalisation neighbourhood to protein coding genes

* feat: adjust logic in `common_neighbourhood_colocalisation_feature_logic`

* fix: correct logic in `common_neighbourhood_vep_feature_logic`

* chore: adjust tests

* chore: wip

* feat: add `extend_missing_colocalisation_to_neighbourhood_genes`

* feat: add variant index as a dependency to the colocalisation nbh features

* test: add `test_extend_missing_colocalisation_to_neighbourhood_genes`

* chore: minor

* chore: fixes tests
diff --git a/docs/python_api/datasets/l2g_features/colocalisation.md b/docs/python_api/datasets/l2g_features/colocalisation.md
@@ -20,4 +20,5 @@ title: From colocalisation
 ## Common logic
 
 ::: gentropy.dataset.l2g_features.colocalisation.common_colocalisation_feature_logic
+::: gentropy.dataset.l2g_features.colocalisation.extend_missing_colocalisation_to_neighbourhood_genes
 ::: gentropy.dataset.l2g_features.colocalisation.common_neighbourhood_colocalisation_feature_logic
diff --git a/src/gentropy/dataset/l2g_features/colocalisation.py b/src/gentropy/dataset/l2g_features/colocalisation.py
@@ -13,6 +13,7 @@
 from gentropy.dataset.l2g_gold_standard import L2GGoldStandard
 from gentropy.dataset.study_index import StudyIndex
 from gentropy.dataset.study_locus import StudyLocus
+from gentropy.dataset.variant_index import VariantIndex
 
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame
@@ -68,6 +69,54 @@ def common_colocalisation_feature_logic(
     )
 
 
+def extend_missing_colocalisation_to_neighbourhood_genes(
+    feature_name: str,
+    local_features: DataFrame,
+    variant_index: VariantIndex,
+    gene_index: GeneIndex,
+    study_locus: StudyLocus,
+) -> DataFrame:
+    """This function creates an artificial dataset of features that represents the missing colocalisation to the neighbourhood genes.
+
+    Args:
+        feature_name (str): The name of the feature to extend
+        local_features (DataFrame): The dataframe of features to extend
+        variant_index (VariantIndex): Variant index containing all variant/gene relationships
+        gene_index (GeneIndex): Gene index to fetch the gene information
+        study_locus (StudyLocus): Study locus to traverse between colocalisation and variant index
+
+    Returns:
+        DataFrame: Dataframe of features that include genes in the neighbourhood not present in the colocalisation results. For these genes, the feature value is set to 0.
+    """
+    coding_variant_gene_lut = (
+        variant_index.df.select(
+            "variantId", f.explode("transcriptConsequences").alias("tc")
+        )
+        .select(f.col("tc.targetId").alias("geneId"), "variantId")
+        .join(gene_index.df.select("geneId", "biotype"), "geneId", "left")
+        .filter(f.col("biotype") == "protein_coding")
+        .drop("biotype")
+        .distinct()
+    )
+    local_features_w_variant = local_features.join(
+        study_locus.df.select("studyLocusId", "variantId"), "studyLocusId"
+    )
+    return (
+        # Get the genes that are not present in the colocalisation results
+        coding_variant_gene_lut.join(
+            local_features_w_variant, ["variantId", "geneId"], "left_anti"
+        )
+        # We now link the missing variant/gene to the study locus from the original dataframe
+        .join(
+            local_features_w_variant.select("studyLocusId", "variantId").distinct(),
+            "variantId",
+        )
+        .drop("variantId")
+        # Fill the information for missing genes with 0
+        .withColumn(feature_name, f.lit(0.0))
+    )
+
+
 def common_neighbourhood_colocalisation_feature_logic(
     study_loci_to_annotate: StudyLocus | L2GGoldStandard,
     colocalisation_method: str,
@@ -79,6 +128,7 @@ def common_neighbourhood_colocalisation_feature_logic(
     study_index: StudyIndex,
     gene_index: GeneIndex,
     study_locus: StudyLocus,
+    variant_index: VariantIndex,
 ) -> DataFrame:
     """Wrapper to call the logic that creates a type of colocalisation features.
 
@@ -92,6 +142,7 @@ def common_neighbourhood_colocalisation_feature_logic(
         study_index (StudyIndex): Study index to fetch study type and gene
         gene_index (GeneIndex): Gene index to add gene type
         study_locus (StudyLocus): Study locus to traverse between colocalisation and study index
+        variant_index (VariantIndex): Variant index to annotate all overlapping genes
 
     Returns:
         DataFrame: Feature annotation in long format with the columns: studyLocusId, geneId, featureName, featureValue
@@ -107,11 +158,23 @@ def common_neighbourhood_colocalisation_feature_logic(
         colocalisation=colocalisation,
         study_index=study_index,
         study_locus=study_locus,
-    ).join(gene_index.df.select("geneId", "biotype"), "geneId", "left")
+    )
+    extended_local_max = local_max.unionByName(
+        extend_missing_colocalisation_to_neighbourhood_genes(
+            local_feature_name,
+            local_max,
+            variant_index,
+            gene_index,
+            study_locus,
+        )
+    )
     # Compute average score in the vicinity (feature will be the same for any gene associated with a studyLocus)
     # (non protein coding genes in the vicinity are excluded see #3552)
     regional_mean_per_study_locus = (
-        local_max.filter(f.col("biotype") == "protein_coding")
+        extended_local_max.join(
+            gene_index.df.select("geneId", "biotype"), "geneId", "left"
+        )
+        .filter(f.col("biotype") == "protein_coding")
         .groupBy("studyLocusId")
         .agg(f.mean(local_feature_name).alias("regional_mean"))
     )
@@ -121,7 +184,7 @@ def common_neighbourhood_colocalisation_feature_logic(
             feature_name,
             f.col(local_feature_name) - f.coalesce(f.col("regional_mean"), f.lit(0.0)),
         )
-        .drop("regional_mean", local_feature_name, "biotype")
+        .drop("regional_mean", local_feature_name)
     )
 
 
@@ -171,7 +234,13 @@ def compute(
 class EQtlColocClppMaximumNeighbourhoodFeature(L2GFeature):
     """Max CLPP for each (study, locus) aggregating over all eQTLs."""
 
-    feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
+    feature_dependency_type = [
+        Colocalisation,
+        StudyIndex,
+        GeneIndex,
+        StudyLocus,
+        VariantIndex,
+    ]
     feature_name = "eQtlColocClppMaximumNeighbourhood"
 
     @classmethod
@@ -256,7 +325,13 @@ def compute(
 class PQtlColocClppMaximumNeighbourhoodFeature(L2GFeature):
     """Max CLPP for each (study, locus, gene) aggregating over all pQTLs."""
 
-    feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
+    feature_dependency_type = [
+        Colocalisation,
+        StudyIndex,
+        GeneIndex,
+        StudyLocus,
+        VariantIndex,
+    ]
     feature_name = "pQtlColocClppMaximumNeighbourhood"
 
     @classmethod
@@ -340,7 +415,13 @@ def compute(
 class SQtlColocClppMaximumNeighbourhoodFeature(L2GFeature):
     """Max CLPP for each (study, locus, gene) aggregating over all sQTLs."""
 
-    feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
+    feature_dependency_type = [
+        Colocalisation,
+        StudyIndex,
+        GeneIndex,
+        StudyLocus,
+        VariantIndex,
+    ]
     feature_name = "sQtlColocClppMaximumNeighbourhood"
 
     @classmethod
@@ -424,7 +505,13 @@ def compute(
 class EQtlColocH4MaximumNeighbourhoodFeature(L2GFeature):
     """Max H4 for each (study, locus) aggregating over all eQTLs."""
 
-    feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
+    feature_dependency_type = [
+        Colocalisation,
+        StudyIndex,
+        GeneIndex,
+        StudyLocus,
+        VariantIndex,
+    ]
     feature_name = "eQtlColocH4MaximumNeighbourhood"
 
     @classmethod
@@ -508,7 +595,13 @@ def compute(
 class PQtlColocH4MaximumNeighbourhoodFeature(L2GFeature):
     """Max H4 for each (study, locus) aggregating over all pQTLs."""
 
-    feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
+    feature_dependency_type = [
+        Colocalisation,
+        StudyIndex,
+        GeneIndex,
+        StudyLocus,
+        VariantIndex,
+    ]
     feature_name = "pQtlColocH4MaximumNeighbourhood"
 
     @classmethod
@@ -592,7 +685,13 @@ def compute(
 class SQtlColocH4MaximumNeighbourhoodFeature(L2GFeature):
     """Max H4 for each (study, locus) aggregating over all sQTLs."""
 
-    feature_dependency_type = [Colocalisation, StudyIndex, GeneIndex, StudyLocus]
+    feature_dependency_type = [
+        Colocalisation,
+        StudyIndex,
+        GeneIndex,
+        StudyLocus,
+        VariantIndex,
+    ]
     feature_name = "sQtlColocH4MaximumNeighbourhood"
 
     @classmethod
diff --git a/src/gentropy/dataset/l2g_features/vep.py b/src/gentropy/dataset/l2g_features/vep.py
@@ -114,7 +114,7 @@ def common_neighbourhood_vep_feature_logic(
             feature_name,
             f.col(local_feature_name) - f.coalesce(f.col("regional_mean"), f.lit(0.0)),
         )
-        .drop("regional_metric", local_feature_name, "biotype")
+        .drop("regional_mean", local_feature_name, "biotype")
     )
 
 
diff --git a/tests/gentropy/dataset/test_l2g_feature.py b/tests/gentropy/dataset/test_l2g_feature.py

Original file line number	Diff line number	Diff line change
`@@ -114,7 +114,7 @@ def common_neighbourhood_vep_feature_logic(`
`114`	`114`	`feature_name,`
`115`	`115`	`f.col(local_feature_name) - f.coalesce(f.col("regional_mean"), f.lit(0.0)),`
`116`	`116`	`)`
`117`		`- .drop("regional_metric", local_feature_name, "biotype")`
	`117`	`+ .drop("regional_mean", local_feature_name, "biotype")`
`118`	`118`	`)`
`119`	`119`
`120`	`120`