opentargets · xyg123 · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024
diff --git a/docs/python_api/datasets/l2g_features/intervals.md b/docs/python_api/datasets/l2g_features/intervals.md
@@ -0,0 +1,17 @@
+---
+title: From intervals
+---
+
+## List of features
+
+::: gentropy.dataset.l2g_features.intervals.PchicMeanFeature
+::: gentropy.dataset.l2g_features.intervals.PchicMeanNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.intervals.EnhTssCorrelationMeanFeature
+::: gentropy.dataset.l2g_features.intervals.EnhTssCorrelationMeanNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.intervals.DhsPmtrCorrelationMeanFeature
+::: gentropy.dataset.l2g_features.intervals.DhsPmtrCorrelationMeanNeighbourhoodFeature
+
+## Common logic
+
+::: gentropy.dataset.l2g_features.intervals.common_interval_feature_logic
+::: gentropy.dataset.l2g_features.intervals.common_neighbourhood_interval_feature_logic
diff --git a/src/gentropy/assets/schemas/intervals.json b/src/gentropy/assets/schemas/intervals.json
@@ -18,10 +18,16 @@
       "nullable": false,
       "type": "string"
     },
+    {
+      "metadata": {},
+      "name": "variantId",
+      "nullable": true,
+      "type": "string"
+    },
     {
       "metadata": {},
       "name": "geneId",
-      "nullable": false,
+      "nullable": true,
       "type": "string"
     },
     {

diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -17,8 +17,7 @@ class SessionConfig:
     write_mode: str = "errorifexists"
     spark_uri: str = "local[*]"
     hail_home: str = os.path.dirname(hail_location)
-    extended_spark_conf: dict[str, str] | None = field(
-        default_factory=dict[str, str])
+    extended_spark_conf: dict[str, str] | None = field(default_factory=dict[str, str])
     output_partitions: int = 200
     _target_: str = "gentropy.common.session.Session"
 
@@ -40,8 +39,7 @@ class ColocalisationConfig(StepConfig):
     credible_set_path: str = MISSING
     coloc_path: str = MISSING
     colocalisation_method: str = MISSING
-    colocalisation_method_params: dict[str, Any] = field(
-        default_factory=dict[str, Any])
+    colocalisation_method_params: dict[str, Any] = field(default_factory=dict[str, Any])
     _target_: str = "gentropy.colocalisation.ColocalisationStep"
 
 
@@ -126,8 +124,7 @@ class EqtlCatalogueConfig(StepConfig):
     eqtl_catalogue_paths_imported: str = MISSING
     eqtl_catalogue_study_index_out: str = MISSING
     eqtl_catalogue_credible_sets_out: str = MISSING
-    mqtl_quantification_methods_blacklist: list[str] = field(
-        default_factory=lambda: [])
+    mqtl_quantification_methods_blacklist: list[str] = field(default_factory=lambda: [])
     eqtl_lead_pvalue_threshold: float = 1e-3
     _target_: str = "gentropy.eqtl_catalogue.EqtlCatalogueStep"
 
@@ -217,6 +214,18 @@ class LDBasedClumpingConfig(StepConfig):
     _target_: str = "gentropy.ld_based_clumping.LDBasedClumpingStep"
 
 
+@dataclass
+class IntervalConfig(StepConfig):
+    """Interval step configuration (targets `gentropy.intervals.IntervalStep`)."""
+
+    gene_index_path: str = MISSING  # no default: must be supplied by the caller
+    liftover_chain_file_path: str = MISSING  # no default: must be supplied by the caller
+    max_distance: int = 250_000  # NOTE(review): unit not shown here, presumably base pairs — confirm
+    interval_sources: dict[str, str] = MISSING  # presumably source name -> source path; verify against IntervalStep
+    processed_interval_path: str = MISSING  # presumably the output location — confirm
+    _target_: str = "gentropy.intervals.IntervalStep"
+
+
 @dataclass
 class LocusToGeneConfig(StepConfig):
     """Locus to gene step configuration."""
@@ -263,6 +272,13 @@ class LocusToGeneConfig(StepConfig):
             "vepMaximumNeighbourhood",
             "vepMean",
             "vepMeanNeighbourhood",
+            # intervals
+            "pchicMean",
+            "pchicMeanNeighbourhood",
+            "enhTssCorrelationMean",
+            "enhTssCorrelationMeanNeighbourhood",
+            "dhsPmtrCorrelationMean",
+            "dhsPmtrCorrelationMeanNeighbourhood",
             # other
             "geneCount500kb",
             "proteinGeneCount500kb",
@@ -306,6 +322,7 @@ class LocusToGeneFeatureMatrixConfig(StepConfig):
     colocalisation_path: str | None = None
     study_index_path: str | None = None
     gene_index_path: str | None = None
+    interval_path: str | None = None
     feature_matrix_path: str = MISSING
     features_list: list[str] = field(
         default_factory=lambda: [
@@ -340,6 +357,13 @@ class LocusToGeneFeatureMatrixConfig(StepConfig):
             "vepMaximumNeighbourhood",
             "vepMean",
             "vepMeanNeighbourhood",
+            # intervals
+            "pchicMean",
+            "pchicMeanNeighbourhood",
+            "enhTssCorrelationMean",
+            "enhTssCorrelationMeanNeighbourhood",
+            "dhsPmtrCorrelationMean",
+            "dhsPmtrCorrelationMeanNeighbourhood",
             # other
             "geneCount500kb",
             "proteinGeneCount500kb",
@@ -681,8 +705,7 @@ class Config:
     """Application configuration."""
 
     # this is unfortunately verbose due to @dataclass limitations
-    defaults: List[Any] = field(default_factory=lambda: [
-                                "_self_", {"step": MISSING}])
+    defaults: List[Any] = field(default_factory=lambda: ["_self_", {"step": MISSING}])
     step: StepConfig = MISSING
     datasets: dict[str, str] = field(default_factory=dict)
 
@@ -716,8 +739,7 @@ def register_config() -> None:
         name="gwas_catalog_top_hit_ingestion",
         node=GWASCatalogTopHitIngestionConfig,
     )
-    cs.store(group="step", name="ld_based_clumping",
-             node=LDBasedClumpingConfig)
+    cs.store(group="step", name="ld_based_clumping", node=LDBasedClumpingConfig)
     cs.store(group="step", name="ld_index", node=LDIndexConfig)
     cs.store(group="step", name="locus_to_gene", node=LocusToGeneConfig)
     cs.store(
@@ -735,8 +757,7 @@ def register_config() -> None:
 
     cs.store(group="step", name="pics", node=PICSConfig)
     cs.store(group="step", name="gnomad_variants", node=GnomadVariantConfig)
-    cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess",
-             node=UkbPppEurConfig)
+    cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess", node=UkbPppEurConfig)
     cs.store(group="step", name="variant_index", node=VariantIndexConfig)
     cs.store(group="step", name="variant_to_vcf", node=ConvertToVcfStepConfig)
     cs.store(
@@ -769,7 +790,5 @@ def register_config() -> None:
         name="locus_to_gene_associations",
         node=LocusToGeneAssociationsStepConfig,
     )
-    cs.store(group="step", name="finngen_ukb_meta_ingestion",
-             node=FinngenUkbMetaConfig)
-    cs.store(group="step", name="credible_set_qc",
-             node=CredibleSetQCStepConfig)
+    cs.store(group="step", name="finngen_ukb_meta_ingestion", node=FinngenUkbMetaConfig)
+    cs.store(group="step", name="credible_set_qc", node=CredibleSetQCStepConfig)
diff --git a/src/gentropy/dataset/intervals.py b/src/gentropy/dataset/intervals.py
@@ -5,17 +5,19 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
+import pyspark.sql.functions as f
+
 from gentropy.common.Liftover import LiftOverSpark
 from gentropy.common.schemas import parse_spark_schema
 from gentropy.dataset.dataset import Dataset
 from gentropy.dataset.gene_index import GeneIndex
+from gentropy.dataset.variant_index import VariantIndex
 
 if TYPE_CHECKING:
     from pyspark.sql import SparkSession
     from pyspark.sql.types import StructType
 
 
-
 @dataclass
 class Intervals(Dataset):
     """Intervals dataset links genes to genomic regions based on genome interaction studies."""
@@ -71,3 +73,35 @@ def from_source(
         source_class = source_to_class[source_name]
         data = source_class.read(spark, source_path)  # type: ignore
         return source_class.parse(data, gene_index, lift)  # type: ignore
+
+    def overlap_variant_index(
+        self: Intervals, variant_index: VariantIndex
+    ) -> Intervals:
+        """Overlap intervals with the variants located inside them.
+
+        Args:
+            variant_index (VariantIndex): Variant index dataset; only `chromosome`, `variantId` and `position` are read
+
+        Returns:
+            Intervals: One row per interval/variant pair on the same chromosome with `position` between `start` and `end`
+        """
+        return Intervals(
+            _df=(
+                self.df.alias("interval")
+                .join(
+                    variant_index.df.selectExpr(
+                        "chromosome as vi_chromosome", "variantId", "position"
+                    ).alias("vi"),
+                    on=[
+                        f.col("vi.vi_chromosome") == f.col("interval.chromosome"),
+                        f.col("vi.position").between(
+                            f.col("interval.start"), f.col("interval.end")
+                        ),
+                    ],
+                    how="inner",  # intervals with no overlapping variant are dropped
+                )
+                .drop("vi_chromosome", "position")  # drop variant-side join columns; interval start/end are kept
+                # NOTE(review): if self.df already has a "variantId" column, the join output carries it twice — confirm
+            ),
+            _schema=Intervals.get_schema(),
+        )